Compare commits

...

2003 Commits

Author SHA1 Message Date
Aliaksandr Valialkin
dd47c8f084 docs/CHANGELOG.md: cut v1.50.0 release 2020-12-15 14:45:00 +02:00
Aliaksandr Valialkin
a66af20686 .github/workflows/main.yml: fall back to go get instead of go install for installing aux tools
It is unclear why `go install` doesn't work in Github Actions. Needs additional investigation.
The following error is returned now:

cannot find package "golang.org/x/lint/golint" in any of:
	/opt/hostedtoolcache/go/1.15.5/x64/src/golang.org/x/lint/golint (from $GOROOT)
	/home/runner/go/src/golang.org/x/lint/golint (from $GOPATH)
2020-12-15 14:19:11 +02:00
Aliaksandr Valialkin
eddc2bd017 lib/promscrape: properly handle scrape errors when stream parsing is enabled
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/967
2020-12-15 14:10:52 +02:00
Aliaksandr Valialkin
8d1031c29a app/vmselect/promql: return expected increase() result for the first point on the graph with value not exceeding 100 2020-12-15 14:10:50 +02:00
Aliaksandr Valialkin
3f88e27d0f Do not set GO111MODULE=off during go install, since this doesn't work in Go1.14 and Go1.15 2020-12-15 13:14:41 +02:00
Aliaksandr Valialkin
aca669c89c docs/CHANGELOG.md: mention that vmagent now accepts multiple -remoteWrite.sendTimeout and -remoteWrite.tlsInsecureSkipVerify command-line flags 2020-12-15 12:59:34 +02:00
Nikolay
7064c4eb8e adds new Array Flags (#965)
* adds ArrayDuration and ArrayBool flags,
makes sendTimeout and tlsInsecure configurable per remoteWrite url

* added backward compatibility testcases for ArrayDuration and ArrayBool

* fixes bool flag

* fixes test cases
2020-12-15 12:59:33 +02:00
Aliaksandr Valialkin
104aac170e lib/promscrape: add bootstrap styles to /targets html page 2020-12-15 12:38:29 +02:00
Aliaksandr Valialkin
ad961fe1f1 lib/promscrape: formatting fixes for /targets page 2020-12-15 11:59:28 +02:00
Aliaksandr Valialkin
38145cfbb8 lib/promscrape: formatting fixes for /targets page 2020-12-15 11:27:22 +02:00
Aliaksandr Valialkin
e17ac90f59 .github/workflows/main.yml: set GO111MODULE=off when installing auxiliary tools via go install 2020-12-15 01:02:13 +02:00
Aliaksandr Valialkin
e85159813f docs/CHANGELOG.md: mention about adding query, first and value functions to alert templates
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/539
2020-12-14 20:17:46 +02:00
Roman Khavronenko
9f578e389c vmalert: add function "query", "first" and "value" to alert templates functions (#960)
The commit adds a support for template function `query`,
`first` and `value`. The function `query` executes
a MetricsQL query for active alerts. In vmalert we
update templates on every evaluation for active alerts
to keep them up to date. With `query` func it may become
a perf issue since it will fire a query on every execution.
We should keep it in mind for now.

https://github.com/VictoriaMetrics/VictoriaMetrics/issues/539
2020-12-14 20:12:16 +02:00
Aliaksandr Valialkin
dafef21001 all: use go install instead of go get for installing auxiliary tools
This is a preparation for Go 1.16, which deprecates `go get` for installing binaries.
See https://tip.golang.org/doc/go1.16#go-command :

  go install, with or without a version suffix (as described above), is now the recommended way
  to build and install packages in module mode. go get should be used with the -d flag to adjust
  the current module's dependencies without building packages, and use of go get to build and install
  packages is deprecated. In a future release, the -d flag will always be enabled.
2020-12-14 20:07:20 +02:00
Aliaksandr Valialkin
09a03b862d docs/CHANGELOG.md: mention bugfix for proper recovering from incorrectly stored persistent queue
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/964
2020-12-14 19:29:23 +02:00
Aliaksandr Valialkin
d98a2f217b lib/persistentqueue: verify that ReaderOffset doesn't exceed WriterOffset when opening the persistent queue
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/964
2020-12-14 19:25:53 +02:00
Aliaksandr Valialkin
b904ae3722 lib/promscrape: add missing whitespace between duration and ago word at /targets page 2020-12-14 14:20:30 +02:00
Aliaksandr Valialkin
a2eb451de4 app/{vmagent,vminsert}: follow-up for ce8c2dd1f1: return /targets page in HTML when requested via web browser 2020-12-14 14:13:01 +02:00
Nikolay
324e3aa1a5 Changes targets api (#961)
* changes /targets api
adds html response if requester accepts text/html,
adds quick template for /targets api,
fixes pathPrefix for / requests

* changes namings

* renamed targets file

* Update app/victoria-metrics/main.go

Co-authored-by: Aliaksandr Valialkin <valyala@gmail.com>

* adds trimspace to qtpl,
moves content-type for targets response closer to writer

* fixes bug with prefix

Co-authored-by: Aliaksandr Valialkin <valyala@gmail.com>
2020-12-14 14:13:00 +02:00
Aliaksandr Valialkin
756fc6fc6c app/victoria-metrics: automatically reset response cache when samples with timestamps older than now - search.cacheTimestampOffset are ingested 2020-12-14 13:10:16 +02:00
Aliaksandr Valialkin
c0db28cd9a docs/MetricsQL.md: clarify that limitk(k, q) returns an arbitrary set of k time series with each call
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/951
2020-12-12 14:18:02 +02:00
Aliaksandr Valialkin
5a846f3e52 vendor: make vendor-update 2020-12-11 23:32:50 +02:00
Aliaksandr Valialkin
eed84ac2b5 docs/FAQ.md: use less confusing links in the chapter explaining why VictoriaMetrics doesn't support Prometheus remote_read API 2020-12-11 21:23:55 +02:00
Aliaksandr Valialkin
fc82c22e50 docs: consistently use links to https://victoriametrics.github.io for documentation references 2020-12-11 21:09:17 +02:00
Aliaksandr Valialkin
f1b303e70d docs/Single-server-VictoriaMetrics.md: clarify docs in Relabeling section 2020-12-11 18:24:31 +02:00
Aliaksandr Valialkin
3ec5387a36 docs/CHANGELOG.md: mention https://github.com/VictoriaMetrics/VictoriaMetrics/issues/955 2020-12-11 17:49:04 +02:00
Aliaksandr Valialkin
c80d38f00c lib/promscrape/discovery/consul: reduce load on Consul API server by increasing timeout for blocking requests from 50 seconds to 9 minutes
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/574
2020-12-11 17:26:34 +02:00
Aliaksandr Valialkin
d6f9bf2d19 app/vmselect/graphite: properly handle wildcards and charsets inside curly braces
For example, `foo{bar*,[a-f]a*b}` should match `foobar`, `foobar123`, `foofab`, etc.

See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/952
2020-12-11 17:26:32 +02:00
Aliaksandr Valialkin
9e79fc27c8 app/vminsert/netstorage: properly update vm_rpc_rerouted_rows_processed_total metric
Previously this metric wasn't updated because of improper defer call.
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/955

Thanks to @xemxx for spotting the bug.
2020-12-11 13:07:05 +02:00
Aliaksandr Valialkin
82d26d9dfe docs/Single-server-VictoriaMetrics.md: sync with upstream README.md via make docs-sync 2020-12-11 12:09:17 +02:00
faceair
054ad542b0 docs/CaseStudies.md: add case study for zhihu (#956) 2020-12-11 12:09:16 +02:00
Aliaksandr Valialkin
4804e004f3 docs/Single-server-VictoriaMetrics.md: clarify that the recommended value for -dedup.minScrapeInterval is scrape_interval from Prometheus configs 2020-12-09 12:16:25 +02:00
Aliaksandr Valialkin
a52924c7a3 docs/CHANGELOG.md: mention about memory leak fix in vmagent when big number of targets is discovered via service discovery 2020-12-09 10:36:05 +02:00
Aliaksandr Valialkin
5b1c4f702e docs/Single-server-VictoriaMetrics.md: sync with upstream README.md via make docs-sync 2020-12-09 10:28:24 +02:00
Aliaksandr Valialkin
a84467958a lib/promscrape/discovery/consul: properly pass Datacenter filter to Consul API server
Previously it has been passed as `sdc` query arg, while it should be passed as `dc` query arg.
See https://www.consul.io/api-docs/health#list-nodes-for-service for details.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/574#issuecomment-740454170
2020-12-08 21:53:23 +02:00
Aliaksandr Valialkin
1a237c6903 all: properly handle CPU limits set on the host system/container
This can reduce memory usage on systems with enabled CPU limits.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/946
2020-12-08 21:07:03 +02:00
Aliaksandr Valialkin
38188e1d6b lib/promscrape: store ScrapeWork items by pointer in the slice returned from get*ScrapeWork()
This should prevent from possible 'memory leaks' when a pointer to ScrapeWork item stored in the slice
could prevent from releasing memory occupied by all the ScrapeWork items stored in the slice when they
are no longer used.

See the related commit e205975716 and the related issue https://github.com/VictoriaMetrics/VictoriaMetrics/issues/825
2020-12-08 17:55:21 +02:00
Aliaksandr Valialkin
bd8eef2528 app/vmselect/promql: do not reduce lookbehind window for any_rollup_func(m) to -search.maxStalenessInterval. It should equal to step value passed to /api/v1/query_range as most users expect 2020-12-08 15:17:05 +02:00
kreedom
996ba2770b Create CODE_OF_CONDUCT_RU.md 2020-12-08 15:16:59 +02:00
Aliaksandr Valialkin
7bdf07883b app/{vmalert,vmagent}: skip empty values in -remoteWrite.label and -label lists 2020-12-08 14:54:02 +02:00
Aliaksandr Valialkin
d5faad0240 lib/promscrape: re-use strings for labels stored in ScrapeWork
This should reduce memory usage when working with big number of scrape targets.
2020-12-08 12:23:44 +02:00
Aliaksandr Valialkin
06091cfdf8 lib/promscrape: export vm_promscrape_scrapers_{started|stopped}_total metrics for monitoring target churn rate 2020-12-08 11:58:44 +02:00
Aliaksandr Valialkin
affcee199c lib/promscrape: store targetStatus entries in targetStatusMap by pointer instead of by value
This guarantees that GC frees memory occupied by targetStatus after it is unregistered from targetStatusMap.
2020-12-08 11:52:20 +02:00
Aliaksandr Valialkin
56a0b058c1 lib/promscrape: export vm_promscrape_active_scrapers{type="<sd_type>"} metric for tracking the number of active scrapers per each service discovery type 2020-12-08 01:54:44 +02:00
Aliaksandr Valialkin
b5b32c65b0 lib/promscrape: do not enable strict config parsing when -promscrape.config.dryRun command-line flag is passed
Strict parsing for -promscrape.config can be enabled by passing `-promscrape.config.strictParse` command-line flag.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/944
2020-12-07 13:18:16 +02:00
Aliaksandr Valialkin
9660774fd1 app/vmselect/graphite: remove duplicate name tag from /tags/autoComplete/tags handler
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/942
2020-12-07 01:10:02 +02:00
Aliaksandr Valialkin
9d79d11a6c lib/promscrape: mention in scrape error message that scrape errors can be disabled by -promscrape.suppressScrapeErrors command-line flag 2020-12-06 23:27:07 +02:00
Roman Khavronenko
22baf8fe25 dashboard: release to grafana.com (#941) 2020-12-06 13:33:52 +02:00
Aliaksandr Valialkin
f6d32f99d7 lib/promscrape: clarify error message on failed connection to scrape target when -enableTCP6 command-line flag isn't set 2020-12-06 13:19:32 +02:00
Aliaksandr Valialkin
3d00613076 lib/protoparser/influx: allow multiple whitespace chars between measurement, fields and timestamp in Influx line protocol 2020-12-06 12:00:28 +02:00
Roman Khavronenko
b53cf5d083 dashboard: Prometheus compatibility fix for Storage full ETA panel (#939) 2020-12-06 01:19:20 +02:00
Aliaksandr Valialkin
80084d1827 docs/CHANGELOG.md: cut v1.49.0 2020-12-05 13:49:17 +02:00
Aliaksandr Valialkin
8dab061f51 vendor: make vendor-update 2020-12-05 12:45:41 +02:00
Aliaksandr Valialkin
f4f530d686 deployment/docker: update Go builder from v1.15.5 to v1.15.6
This fixes issues found in Go since v1.15.5 - see https://github.com/golang/go/issues?q=milestone%3AGo1.15.6+label%3ACherryPickApproved
2020-12-05 12:38:14 +02:00
Aliaksandr Valialkin
d242c2f2bd app/vmselect/promql: add count_eq_over_time(m[d], N) and count_ne_over_time(m[d], N) for calculating the number of samples in m over d that are equal / not equal to N 2020-12-05 12:31:01 +02:00
Aliaksandr Valialkin
1430bbcf33 lib/promscrape/discoveryutils: remove limit on the number of concurrently running blocking queries
Too low limit could result in unexpected errors when performing big number of blocking queries.
2020-12-05 12:15:47 +02:00
Aliaksandr Valialkin
528587deef lib/flagutil: make golangci-lint happy by using strings.TrimPrefix instead of manual prefix removal via strings.HasPrefix 2020-12-03 22:07:26 +02:00
Aliaksandr Valialkin
bdac2171f1 all: do not print usage info for all the flags when incorrect command-line flag is passed
This should improve usability for VictoriaMetrics apps that have big number of command-line flags,
i.e. all the apps.
2020-12-03 21:46:19 +02:00
Aliaksandr Valialkin
8cf76d8747 app/vmselect/promql: add label_uppercase(q, label1, ... labelN) and label_lowercase(q, label1, ... labelN) functions
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/936
2020-12-03 21:46:18 +02:00
Aliaksandr Valialkin
732d354b90 vendor: make vendor-update 2020-12-03 20:21:54 +02:00
Aliaksandr Valialkin
96190f9d45 lib/promscrape/discovery/consul: log the time needed for stopping Consul service watcher 2020-12-03 20:14:48 +02:00
Aliaksandr Valialkin
4e4a93c586 lib/promscrape/discovery/consul: make sure that block response contains X-Consul-Index header 2020-12-03 20:05:54 +02:00
Aliaksandr Valialkin
7a889f6850 lib/promscrape: code cleanup after c6dee6c52d
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/574
2020-12-03 19:52:09 +02:00
Nikolay
0b302d33cb Changes consul discovery api (#921)
* adds consul watch api,
it must reduce load on consul service with blocking wait requests,
changed discoveryClient api with fetchResponseMeta callback.

* small fix

* fix after master merge

* adds watch client at discovery utils

* fixes consul watcher,
changes namings,
fixes data race

* small typo fix

* sanity fix

* fix naming and service node update
2020-12-03 19:52:08 +02:00
Aliaksandr Valialkin
fca915dcf3 docs/Single-server-VictoriaMetrics.md: update features chapter according to the latest developments 2020-12-03 13:01:46 +02:00
Aliaksandr Valialkin
45c402ad8a docs/Single-server-VictoriaMetrics.md: remove duplicate provide word 2020-12-03 09:48:54 +02:00
Aliaksandr Valialkin
11bbb3552d app/vmselect/promql: make fmt 2020-12-02 21:34:15 +02:00
Aliaksandr Valialkin
dd96714a2c docs/FAQ.md: add a link to https://valyala.medium.com/prometheus-vs-victoriametrics-benchmark-on-node-exporter-metrics-4ca29c75590f in performance comparisons section 2020-12-02 21:25:15 +02:00
Aliaksandr Valialkin
9e98a8f3d3 app/vmselect/promql: return nan from minute(m) when m equals to nan
This aligns VictoriaMetrics behaviour with Prometheus behaviour.

The issue has been spotted in https://promlabs.com/promql-compliance-test-results/2020-12-01/victoriametrics/
2020-12-02 20:16:40 +02:00
Aliaksandr Valialkin
def513355e app/vmselect/promql: do not return 0 value from sum_over_time(m[d]) when there are no samples on the given d window.
This aligns the behaviour of `sum_over_time()` with other `_over_time()` functions and with Prometheus behavior.
2020-12-02 13:12:33 +02:00
Aliaksandr Valialkin
490c70a958 app/vmselect: return metric values from time() cmp_op metric query when cmp_op comparison is true
This aligns MetricsQL behavior to Prometheus' one.

The issue has been identified at https://promlabs.com/promql-compliance-test-results/2020-12-01/victoriametrics/
2020-12-02 12:09:40 +02:00
Aliaksandr Valialkin
a8c5e2f0c5 vendor: upgrade github.com/VictoriaMetrics/metricsql from v0.7.2 to v0.7.3
This fixes parsing of hex numbers in MetricsQL such as 0x3b

The bug has been detected at https://promlabs.com/promql-compliance-test-results/2020-12-01/victoriametrics/
2020-12-02 08:11:42 +02:00
Aliaksandr Valialkin
e6bc08436c docs/Articles.md: add a link to https://victoriametrics.medium.com/how-to-monitor-go-applications-with-victoriametrics-c04703110870 2020-12-02 07:21:10 +02:00
Aliaksandr Valialkin
a34910e12c docs/Articles.md: add a link to an article on how Percona PMM has been migrated from Prometheus to VictoriaMetrics 2020-12-02 07:21:08 +02:00
Aliaksandr Valialkin
4ef7158e89 app/vmselect/promql: return nan from a >bool b if a is nan in the same way as Prometheus does 2020-12-02 00:28:56 +02:00
Aliaksandr Valialkin
adf45b730c app/vmselect/searchutils: return elapsed time in Deadline.String() output
This should improve debuggability for error messages containing Deadline.String() output
2020-12-01 00:14:36 +02:00
Aliaksandr Valialkin
1dce37b2fa app/vmbackup/snapshot: add missing status code check for the returned response when working with snapshot API
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/929
2020-11-30 14:49:29 +02:00
Roman Khavronenko
d5ba66c303 Cluster dashboard (#931)
* dashboard: add `Storage full ETA` panel

Backport of https://github.com/VictoriaMetrics/VictoriaMetrics/pull/858

* dashboard: add `Storage reachability` panel
2020-11-30 11:30:11 +02:00
Aliaksandr Valialkin
8b5a38376d app/vmbackup/snapshot: log url and response body on failed JSON response parsing
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/929
2020-11-29 12:16:08 +02:00
Nikolay
e4e33cb757 fixes checksum calculation (#928)
* fixes checksum calculation,
'for' rule param wasn't marshaled properly during checksum calculation

* fixes error
2020-11-29 09:50:57 +02:00
Aliaksandr Valialkin
9eca96596f lib/storage: add missing (AccountID, ProjectID) in MetricName.String() test 2020-11-29 01:25:50 +02:00
Aliaksandr Valialkin
2385ac11c0 lib/promscrape: fix failing tests after a906b3862f 2020-11-29 01:25:49 +02:00
Aliaksandr Valialkin
2ed721e457 lib/protoparser/prometheus: properly parse OpenMetrics timestamps
OpenMetrics timestamps are floating-point numbers, that represent Unix timestamp in seconds.
This differs from Prometheus exposition format, where timestamps are integer numbers representing Unix timestamp in milliseconds.
2020-11-27 14:54:36 +02:00
Aliaksandr Valialkin
2c0b1d5454 sync with README.md from single-node version 2020-11-27 13:23:16 +02:00
Aliaksandr Valialkin
6bd9fe4e77 docs/Articles.md: add https://valyala.medium.com/prometheus-vs-victoriametrics-benchmark-on-node-exporter-metrics-4ca29c75590f 2020-11-27 10:25:00 +02:00
Aliaksandr Valialkin
3bb9bf33d6 lib/promscrape: reduce memory allocations when unpacking gzipped responses received from scrape targets 2020-11-26 18:32:16 +02:00
Aliaksandr Valialkin
af667c59c1 all: typo fix: thouthand->thousand 2020-11-26 13:34:05 +02:00
Aliaksandr Valialkin
3de102bcf1 vendor: update github.com/VictoriaMetrics/fasthttp from v1.0.8 to v1.0.9 2020-11-26 13:27:23 +02:00
Aliaksandr Valialkin
3dd2282ed9 lib/promscrape: release http response on non-200 status code 2020-11-26 13:25:25 +02:00
Aliaksandr Valialkin
e6447e7588 vendor: update github.com/VictoriaMetrics/fasthttp from v1.0.7 to v1.0.8 2020-11-26 12:13:09 +02:00
Aliaksandr Valialkin
81fadba0b2 docs/CHANGELOG.md: cut v1.48.0 release 2020-11-26 02:06:11 +02:00
Aliaksandr Valialkin
b542df9ab5 docs/CHANGELOG.md: add a link to Netflix Eureka - https://github.com/Netflix/eureka 2020-11-26 01:36:30 +02:00
Aliaksandr Valialkin
3f52e59efe app/{vmagent,victoria-metrics}: add -dryRun option and make more clear handling for -promscrape.config.dryRun 2020-11-25 23:01:39 +02:00
Aliaksandr Valialkin
ed06990609 app/vmagent: do not enable -promscrape.config.strictParse when -dryRun command-line flag is set
Users can specify -promscrape.config.strictParse if -promscrape.config shouldn't contain unknown config entries
2020-11-25 22:27:41 +02:00
Aliaksandr Valialkin
8f8727cb65 lib/mergeset: tune the number of rawItemsBlocks to merge at once
512 blocks give higher ingestion performance and slightly lower memory usage
2020-11-25 21:53:15 +02:00
Aliaksandr Valialkin
8fcd87a6a5 lib/mergeset: help GC by removing references to slices in inmemoryBlock.Reset 2020-11-25 21:20:02 +02:00
Aliaksandr Valialkin
03002f1fe1 lib/storage: log metric name plus all its labels when the metric timestamp is outside the configured retention
This should simplify debugging when the source of the metric with unexpected timestamp must be found.
2020-11-25 14:44:29 +02:00
Aliaksandr Valialkin
4848a05924 lib/storage: typo fix in error message: allowd->allowed 2020-11-25 14:15:54 +02:00
Aliaksandr Valialkin
26e699c440 lib/protoparser/prometheus: properly parse "infinity" values in OpenMetrics format
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/924
2020-11-24 19:02:50 +02:00
Aliaksandr Valialkin
a8b5a6e517 lib/logger: disable rate limiting for error and warn logs by default 2020-11-24 12:42:07 +02:00
Aliaksandr Valialkin
7f3e884a31 all: spelling fix: superflouos->superfluous. This is a follow-up for 0acdab3ab9 2020-11-24 12:42:04 +02:00
BigFish
3159b41689 Update main.go (#922)
fix spelling mistake
2020-11-24 12:36:47 +02:00
Aliaksandr Valialkin
284d805895 docs/CHANGELOG.md: mention that /tags/delSeries handler is supported after f0c207fae2 2020-11-24 12:30:36 +02:00
Aliaksandr Valialkin
dad8b76a0e lib/protoparser/prometheus: properly parse metrics with exemplars
Exemplars have been introduced in OpenMetrics - see https://github.com/OpenObservability/OpenMetrics/blob/master/OpenMetrics.md#exemplars-1
Previously VictoriaMetrics couldn't parse the following metric

    foo{bar="baz"} 123 # exemplar here

This commit fixes this. Note that VictoriaMetrics ignores the exemplar as for now.
2020-11-24 12:30:34 +02:00
Aliaksandr Valialkin
768fd8c3d9 docs/Articles.md: add recent articles about VictoriaMetrics 2020-11-24 12:30:30 +02:00
Aliaksandr Valialkin
2cc288c023 app/vmbackup: cosmetic fixes 2020-11-23 17:10:13 +02:00
Aliaksandr Valialkin
8b82f9d8b8 lib/promscrape: expose __meta_ec2_ipv6_addresses label for ec2_sd_config like Prometheus will do in the next release 2020-11-23 16:57:03 +02:00
Aliaksandr Valialkin
c2186279b7 lib/promscrape: add filters option to dockerswarm_sd_config like Prometheus did in v2.23.0 2020-11-23 16:27:33 +02:00
Aliaksandr Valialkin
e1297c0b78 app/vmselect: add /tags/delSeries handler from Graphite Tags API
See https://graphite.readthedocs.io/en/stable/tags.html#removing-series-from-the-tagdb
2020-11-23 15:32:14 +02:00
Aliaksandr Valialkin
3d2ce31cad app/vmselect/netstorage: code readability improvement: rename *RequestErrors to *Errors 2020-11-23 15:00:15 +02:00
Aliaksandr Valialkin
433ae806ac app/vmselect: implement /tags/tagSeries and /tags/tagMultiSeries in order to be consistent with single-node VictoriaMetrics 2020-11-23 14:57:08 +02:00
Aliaksandr Valialkin
7987129baa app/vmselect/netstorage: move common code for requests execution on all the storage nodes to startStorageNodesRequest func 2020-11-23 10:51:48 +02:00
Aliaksandr Valialkin
25a57ced6c app/vmselect/netstorage: prevent from data races in ProcessSearchQuery and in Export funcs when -replicationFactor > 1
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/711
2020-11-23 10:25:51 +02:00
Aliaksandr Valialkin
f4fd917e4f lib/fs: replace fs.OpenReaderAt with fs.MustOpenReaderAt
All the callers for fs.OpenReaderAt expect that the file will be opened.
So it is better to log fatal error inside fs.MustOpenReaderAt instead of leaving this to the caller.
2020-11-23 09:57:30 +02:00
Aliaksandr Valialkin
1dcb438c3b app/vmselect/netstorage: typo fix after 990eb29a9b 2020-11-23 01:09:43 +02:00
Aliaksandr Valialkin
85eecf5801 app/vmselect/netstorage: add -replicationFactor command-line flag for reducing query duration when a part of vmstorage nodes are temporarily slow and/or temporarily unavailable
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/711
2020-11-23 00:39:53 +02:00
Aliaksandr Valialkin
990eb29a9b app/vmselect/netstorage: move common code for collecting query results from vmstorage nodes to collectResults function 2020-11-23 00:16:02 +02:00
Aliaksandr Valialkin
d892d63204 lib/promscrape: hint that -enableTCP6 command-line flag can be used for connecting to IPv6 addresses 2020-11-21 14:39:05 +02:00
Aliaksandr Valialkin
8608e956dd lib/promscrape/discovery/eureka: follow-up after eec76718e9 2020-11-20 14:02:14 +02:00
Nikolay
bb2bcb9725 Adds eureka service discovery (#913)
* Adds eureka service discovery
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/851
Netflix service discovery for AWS

* Apply suggestions from code review

Co-authored-by: Aliaksandr Valialkin <valyala@gmail.com>
2020-11-20 14:02:13 +02:00
John Belmonte
7ac49ac176 MetricsQL docs: parameter consistency (#915)
* MetricsQL docs: parameter consistency

if I understand correctly:
  * `fun(q)` - fun takes instant vector
  * `fun(m[d])` - fun takes range vector

* Update docs/MetricsQL.md

Co-authored-by: Aliaksandr Valialkin <valyala@gmail.com>
2020-11-20 11:43:07 +02:00
Aliaksandr Valialkin
59aa57efbc vendor: make vendor-update 2020-11-19 19:21:23 +02:00
Aliaksandr Valialkin
5e39bedf40 docs/CHANGELOG.md: mention that slow query log now contains remote client address 2020-11-19 12:41:26 +02:00
Aliaksandr Valialkin
8e1f657ef9 lib/logger: follow-up for 09105ff49c 2020-11-19 12:37:05 +02:00
Nikolay
f54a5f3868 Adds log suppression per caller (#908)
* Adds log suppression per caller
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/905

* fixes style and report message
2020-11-19 12:19:20 +02:00
Aliaksandr Valialkin
e72ccc9239 app/vmselect: add remoteAddr to slow query log in order to improve debuggability
This will simplify identifying the client that sends slow queries to VictoriaMetrics.
2020-11-18 20:40:02 +02:00
Aliaksandr Valialkin
2af08d6e97 docs/CHANGELOG.md: mention about snap install victoriametrics 2020-11-18 19:50:38 +02:00
Aliaksandr Valialkin
5331349efb docs: make snap install victoriametrics more prominent in docs 2020-11-18 19:45:10 +02:00
Aliaksandr Valialkin
f372148203 docs/Single-server-VictoriaMetrics.md: an attempt to fix markdown formatting in Graphite Tags API section 2020-11-18 14:41:48 +02:00
Aliaksandr Valialkin
e7976363f9 docs: lowercase adidas trademark according to their request 2020-11-18 13:47:56 +02:00
Aliaksandr Valialkin
a5413aa438 docs/Cluster-VictoriaMetrics.md: adjust RAM sizing recommendations for vmstorage nodes
It is recommended to have at least 50% of free RAM on vmstorage nodes in order to handle possible
RAM usage spikes during rolling upgrade for vmstorage nodes when time series
are re-routed from temporarily unavailable node to the remaining active nodes.
2020-11-18 13:02:38 +02:00
Aliaksandr Valialkin
a0a34e2a26 docs/Single-server-VictoriaMetrics.md: make consistent section title sizes 2020-11-18 12:36:22 +02:00
Aliaksandr Valialkin
0895b7f411 lib/logger: add -loggerWarnsPerSecondLimit command-line flag for rate limiting of WARN log messages
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/905
2020-11-18 03:43:17 +02:00
Aliaksandr Valialkin
bf9b6b77c8 docs/Single-server-VictoriaMetrics.md: mention that /internal/force_flush endpoint is mostly needed for testing and debugging 2020-11-18 01:37:39 +02:00
Aliaksandr Valialkin
ea4afb201b app/vmselect/netstorage: typo fix in a comment inside SearchMetricNames func 2020-11-18 01:35:37 +02:00
Aliaksandr Valialkin
c6adcafedb app/vminsert: export vm_rpc_vmstorage_is_reachable metric, which can be used for monitoring reachability of vmstorage nodes from vminsert nodes 2020-11-17 22:13:26 +02:00
Aliaksandr Valialkin
a00df790b1 docs/CHANGELOG.md: cut v1.47.0 release 2020-11-16 21:00:21 +02:00
Aliaksandr Valialkin
d29997dab6 vendor: make vendor-update 2020-11-16 20:52:18 +02:00
Aliaksandr Valialkin
e8ef94db4b Makefile: add -d flag to go get in vendor-update target
This should skip unnecessary build step for the updated packages
2020-11-16 20:52:15 +02:00
Aliaksandr Valialkin
5454137504 docs/Cluster-VictoriaMetrics.md: make docs-sync after 57dc152e9d 2020-11-16 20:20:19 +02:00
Nikita Babushkin
57dc152e9d fix typo in readme.md (#900) 2020-11-16 20:18:25 +02:00
Aliaksandr Valialkin
7d76fdedcc app/vmselect: use storage.NewSearchQuery() instead of constructing storage.SearchQuery in-place
This should prevent from bugs when AccountID and ProjectID aren't set in storage.SearchQuery.
2020-11-16 18:04:33 +02:00
Aliaksandr Valialkin
a9287cf564 lib/storage: do not pass (accountID, projectID) to SearchTagNames(), since they are already passed via tfss 2020-11-16 18:04:30 +02:00
Aliaksandr Valialkin
911c6d3bcd app/vmselect: add missing graphite prefix to /tags/autoComplete/{tags,values} 2020-11-16 18:04:24 +02:00
Aliaksandr Valialkin
f7f866d83b app/vmselect/netstorage: typo fix 2020-11-16 15:54:45 +02:00
Aliaksandr Valialkin
59fb75717e app/vmselect/netstorage: apply Graphite filter after substituting __name__ with name 2020-11-16 15:50:53 +02:00
Aliaksandr Valialkin
6bcbdb18fb docs/CHANGELOG.md: mention about Graphite Tags API implementation 2020-11-16 15:34:56 +02:00
Aliaksandr Valialkin
eb763bcb9d app/vmselect/graphite: add /tags/autoComplete/values handler from Graphite Tags API 2020-11-16 15:29:29 +02:00
Aliaksandr Valialkin
f2f16d8e79 app/vmselect/graphite: add /tags/autoComplete/tags handler from Graphite Tags API
See https://graphite.readthedocs.io/en/stable/tags.html#auto-complete-support
2020-11-16 14:58:10 +02:00
Aliaksandr Valialkin
2f4421b86c app/vmselect/prometheus: return __name__ label if match[] query to /api/v1/labels matches at least a single time series 2020-11-16 13:54:50 +02:00
Aliaksandr Valialkin
852aed62f7 app/vmselect/prometheus: improve performance for /api/v1/labels and /api/v1/label/<labelName>/values on time ranges exceeding one day when match[] query arg is set 2020-11-16 13:46:51 +02:00
Aliaksandr Valialkin
e969346e3e app/vmselect/prometheus: fix deadlock in /api/v1/series on a time range exceeding one day 2020-11-16 13:30:57 +02:00
Aliaksandr Valialkin
ac7460abdd lib/storage: add a test for Storage.SearchMetricNames 2020-11-16 13:18:48 +02:00
Aliaksandr Valialkin
467ed68a37 docs/CHANGELOG.md: mention about improved performance for /api/v1/series on a time range exceeding one day 2020-11-16 12:52:33 +02:00
Aliaksandr Valialkin
eea1be0d5c app/vmselect/graphite: add /tags/findSeries handler from Graphite Tags API
See https://graphite.readthedocs.io/en/stable/tags.html#exploring-tags
2020-11-16 12:52:23 +02:00
Aliaksandr Valialkin
97100b1d42 app/vmselect/graphite: apply filter then limit 2020-11-16 12:52:18 +02:00
Aliaksandr Valialkin
5889273920 app/vmselect/graphite: add /tags/<tag_name> handler for Graphite Tags API 2020-11-16 03:41:41 +02:00
Aliaksandr Valialkin
99cb1a70cf app/vmselect/graphite: add /tags handler from Graphite Tags API
See https://graphite.readthedocs.io/en/stable/tags.html#exploring-tags
2020-11-16 02:57:20 +02:00
Aliaksandr Valialkin
4be5b5733a app/vminsert: add /tags/tagSeries and /tags/tagMultiSeries handlers from Graphite Tags API
See https://graphite.readthedocs.io/en/stable/tags.html#adding-series-to-the-tagdb
2020-11-16 02:40:04 +02:00
Aliaksandr Valialkin
9ec964bff8 lib/storage: do not show artificially created label for reverse Graphite labels at /api/v1/labels page 2020-11-16 00:44:54 +02:00
Aliaksandr Valialkin
2ac5f00d98 app/vmselect: propagate errors from vmstorage to response to the client if -search.denyPartialResponse command-line flag is set
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/891

This commit also adds `"isPartial":{true|false}` field to `/api/v1/*` responses. `"isPartial":true` is set when the response
is based on a partial data because some of vmstorage nodes weren't available during query processing.
2020-11-14 13:20:10 +02:00
Aliaksandr Valialkin
882e2e2099 app/vminsert/netstorage: return 503 status code to client when all the vmstorage nodes are unavailable
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/896
2020-11-14 00:44:41 +02:00
Aliaksandr Valialkin
c329aacedf docs/Single-server-VictoriaMetrics.md: document /internal/force_flush endpoint 2020-11-13 18:43:40 +02:00
Aliaksandr Valialkin
f19547039a docs/Single-server-VictoriaMetrics.md: explain why recently inserted data may be unavailable for querying for a few seconds 2020-11-13 18:34:31 +02:00
Aliaksandr Valialkin
f80d6473e1 lib/protoparser/promremotewrite: log the time spent on unsuccessful data read from the network
This should help with debugging `connection timed out` errors.
2020-11-13 17:49:21 +02:00
Aliaksandr Valialkin
1364dfdd8c docs/CHANGELOG.md: mentioned that Go builder has been updated from v1.15.4 to v1.15.5
See 3fa9ab4a49 for details.
2020-11-13 16:22:18 +02:00
Vasily
8ba168f3be Add omitempty for DisableCompression and DisableKeepAlive fields in ScrapeConfig (#796)
* Add omitempty for DisableCompression and DisableKeepAlive fields in ScrapeConfig

* Add omitempty annotation to all the default/optional values

* Fix annotations after review
2020-11-13 16:17:03 +02:00
Aliaksandr Valialkin
cc4da051f3 docs/Single-server-VictoriaMetrics.md: sync with single-node README.md 2020-11-13 16:03:50 +02:00
Aliaksandr Valialkin
8f42e59e05 app/vmselect/promql: remove spikes from increase() and delta() results on time series with spare irregular data points
Do not take into account spare data point value if the next point is located too far from the current point.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/894
2020-11-13 15:23:37 +02:00
Aliaksandr Valialkin
da6d82a8dd app/vmselect/promql: assume that time series value doesn't change during gaps when calculating increase() and delta()
This should remove unexpected spikes at the end of gaps.
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/894
2020-11-13 14:59:32 +02:00
Aliaksandr Valialkin
d21b1606a1 lib/protoparser/opentsdbhttp: increment errors counter on unmarshal errors
This is a follow-up for 149c0c4a6d
2020-11-13 13:23:11 +02:00
Aliaksandr Valialkin
4d9501a0c4 vendor: make vendor-update 2020-11-13 13:08:41 +02:00
Aliaksandr Valialkin
22c1e29284 lib/protoparser: propagate callback error to the caller of ParseStream for every supported data ingestion protocols
The caller of ParseStream then can generate HTTP 503 responses for non-nil errors occurred in callbacks when processing incoming requests.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/896
2020-11-13 13:05:34 +02:00
Aliaksandr Valialkin
9dfe00c962 lib/protoparser/promremotewrite: synchronously process Prometheus remote_write requests
There is no reason in processing these requests asynchronously in the face of https://github.com/VictoriaMetrics/VictoriaMetrics/issues/896
Synchronous processing code is easier to read and understand than the previous async code
2020-11-13 12:17:32 +02:00
Aliaksandr Valialkin
739b88c1e4 lib/protoparser/promremotewrite: forward errors, which can occur during data ingestion, to the caller of ParseStream, so it could properly return HTTP 503 status code on non-nil error
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/896
2020-11-13 11:00:41 +02:00
Aliaksandr Valialkin
13b547f218 deployment/docker: update Go builder from v1.15.4 to v1.15.5
This fixes the following possible issues in Go - https://github.com/golang/go/issues?q=milestone%3AGo1.15.5+label%3ACherryPickApproved
2020-11-13 11:00:40 +02:00
Aliaksandr Valialkin
7ceaf4ba8f all: consistently return text-based HTTP responses with charset=utf-8
This is a follow-up for https://github.com/VictoriaMetrics/VictoriaMetrics/pull/897
2020-11-13 10:30:21 +02:00
faceair
64e99744f1 add charset on targets response (#897) 2020-11-13 10:18:13 +02:00
Aliaksandr Valialkin
f7a6ae3d11 docs/vmagent.md: added a link to https://valyala.medium.com/how-to-use-relabeling-in-prometheus-and-victoriametrics-8b90fc22c4b2 into Relabeling section 2020-11-12 12:27:13 +02:00
Aliaksandr Valialkin
069979c367 docs/vmagent.md: typo fix 2020-11-11 16:05:04 +02:00
Aliaksandr Valialkin
7a0094adae docs/vmagent.md: add Configuration update section 2020-11-11 16:01:21 +02:00
Aliaksandr Valialkin
94842ed942 docs/Single-server-VictoriaMetrics.md: document -search.treatDotsAsIsInRegexps command-line option 2020-11-11 14:59:38 +02:00
immerrr again
1ec1a9f27f app/vmstorage: add "/internal/force_flush" endpoint (#893) 2020-11-11 14:46:37 +02:00
Aliaksandr Valialkin
6b979ea5a7 docs/Single-server-VictoriaMetrics.md: small clarifications in VictoriaMetrics features 2020-11-11 13:48:16 +02:00
Aliaksandr Valialkin
20d0ead2f2 docs/Single-server-VictoriaMetrics.md: update the link to enterprise features 2020-11-11 13:42:42 +02:00
Aliaksandr Valialkin
779f591bcc docs/Single-server-VictoriaMetrics.md: mention that /api/v1/status/tsdb handler accepts topN and date query args 2020-11-11 13:38:26 +02:00
Aliaksandr Valialkin
8033e4885b docs/Cluster-VictoriaMetrics.md: mention about optional topN and date query args for /api/v1/status/tsdb handler 2020-11-11 13:35:13 +02:00
Aliaksandr Valialkin
4f2c5877db app/vmselect: add -search.treatDotsAsIsInRegexps command-line flag for automatic escaping of dots in regexp label filters 2020-11-11 12:40:28 +02:00
Aliaksandr Valialkin
0769f86a7e docs/Single-server-VictoriaMetrics.md: clarify which directories can be removed when recovering from data corruption 2020-11-11 12:40:28 +02:00
Aliaksandr Valialkin
346181fd48 docs/Single-server-VictoriaMetrics.md: add a hint that case studies can be read by clicking on the corresponding link 2020-11-11 12:40:28 +02:00
Aliaksandr Valialkin
a78bf34ff3 app/vmselect: do not return isPartialResponse=true when all the storageNodes return errors 2020-11-10 18:48:57 +02:00
Aliaksandr Valialkin
697fd44158 lib/promscrape: make a copy of ScrapeWork from discovered []ScrapeWork slice instead of referring to an item in this slice
This should prevent from holding previously discovered []ScrapeWork slices when a part of discovered targets changes over time.
This should reduce memory usage for the case when big number of discovered scrape targets changes over time.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/825
2020-11-10 16:13:31 +02:00
Aliaksandr Valialkin
2ec02b7bdb lib/promscrape: pre-allocate slice for discovered targets based on previously discovered targets
This should reduce load on GC a bit when discovering big number of scrape targets
2020-11-10 15:57:43 +02:00
Aliaksandr Valialkin
8f3339fa81 app/vmselect/promql: do not return data points in the end of the selected time range for time series ending in the middle of the selected time range
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/887
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/845
2020-11-10 14:51:55 +02:00
Aliaksandr Valialkin
6385432611 app/vmselect: typo fix in a description for -search.minStalenessInterval: mimimum->minimum 2020-11-10 01:18:59 +02:00
Aliaksandr Valialkin
6ae9d79f6b docs/CHANGELOG.md: mention about explicit setting of extra labels in alert entities (see 3adf8c5a6f)
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/870
2020-11-10 00:41:31 +02:00
Aliaksandr Valialkin
ad82b6ead8 Move CHANGELOG.md to docs/CHANGELOG.md 2020-11-10 00:37:15 +02:00
Roman Khavronenko
4fd2b6cd16 vmalert: explicitly set extra labels to alert entities (#886)
The previous implementation treated extra labels (global and rule labels) as
separate label set to returned time series labels. Hence, time series always contained
only original labels and alert ID was generated from sorted labels key-values.
Extra labels didn't affect the generated ID and were applied on the following actions:
- templating for Summary and Annotations;
- persisting state via remote write;
- restoring state via remote read.

Such behaviour caused difficulties on restore procedure because extra labels had to be dropped
before checking the alert ID, but that not always worked. Consider the case when expression
returns the following time series `up{job="foo"}` and rule has extra label `job=bar`.
This would mean that restored alert ID will be always different to the real time series because
of collision.

To solve the situation extra labels are now always applied beforehand and `vmalert` doesn't
store original labels anymore. However, this could result into a new error situation.
Consider the case when expression returns two time series `up{job="foo"}` and `up{job="baz"}`,
while rule has extra label `job=bar`. In such case, applying extra labels will result into
two identical time series and `vmalert` will return error:
 `result contains metrics with the same labelset after applying rule labels`

https://github.com/VictoriaMetrics/VictoriaMetrics/issues/870
2020-11-10 00:27:56 +02:00
Aliaksandr Valialkin
a8562d643b lib/promscrape: add -promscrape.dropOriginalLabels command-line flag for reducing memory usage when discovering big number of scrape targets
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/878
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/825
2020-11-10 00:20:49 +02:00
Aliaksandr Valialkin
aa3e46a02d lib/promscrape: further reduce memory usage for per-scrape target labels by making a copy of actually used labels
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/825
2020-11-09 10:55:03 +02:00
Aliaksandr Valialkin
96e9deecbc docs/Single-server-VictoriaMetrics.md: typo fix 2020-11-08 13:40:52 +02:00
Aliaksandr Valialkin
a8ad3091e3 CHANGELOG.md: cut v1.46.0 2020-11-07 17:54:10 +02:00
Aliaksandr Valialkin
d2296b7e09 docs/CHANGELOG.md: make docs-sync 2020-11-07 16:30:41 +02:00
Aliaksandr Valialkin
b8083b7659 lib/promscrape: clean references to label name and label value strings after applying per-target relabeling
This should reduce memory usage when per-target relabeling creates big number of temporary labels
with long names and/or values.

See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/825
2020-11-07 16:19:52 +02:00
Aliaksandr Valialkin
b4efe626d7 lib/promscrape/discovery/kubernetes: go fmt 2020-11-07 13:04:09 +02:00
Aliaksandr Valialkin
92bc1afcee lib/promscrape/discovery/kubernetes: reduce memory usage for labels when discovering big number of scrape targets by using string concatenation instead of fmt.Sprintf
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/825
2020-11-07 13:03:01 +02:00
Aliaksandr Valialkin
535fea3d11 lib/promscrape: eliminate data race in stream parse mode
Previously `-promscrape.streamParse` mode could result in garbage labels for the scraped metrics because of data race.
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/825#issuecomment-723198247
2020-11-07 12:45:52 +02:00
Aliaksandr Valialkin
6ca5a94359 docs/CHANGELOG.md: add CHANGELOG header 2020-11-07 01:14:49 +02:00
Aliaksandr Valialkin
4b57604657 deployment/docker: update Go builder from v1.15.3 to v1.15.4
This fixes issues found in Go - see https://github.com/golang/go/issues?q=milestone%3AGo1.15.4+label%3ACherryPickApproved
2020-11-07 00:49:34 +02:00
Aliaksandr Valialkin
45958d16f6 vendor: make vendor-update 2020-11-06 13:57:06 +02:00
Aliaksandr Valialkin
6866e4ab25 deployment/docker: update root image from alpine:3.12 to alpine:3.12.1 2020-11-06 13:54:30 +02:00
Aliaksandr Valialkin
efebc3b6fb app/vmselect/promql: code cleanup after 43823addea 2020-11-06 01:31:33 +02:00
n4mine
3127aa92b5 app/vmselect/promql: fix when the parameter of maxValue(), minValue() leading by NaN. it will cause {top,bottom}k_{max,min} return inappropriate result (#883) 2020-11-06 01:31:31 +02:00
Aliaksandr Valialkin
8accbc14d8 docs/Articles.md: add a link to https://valyala.medium.com/how-to-use-relabeling-in-prometheus-and-victoriametrics-8b90fc22c4b2 2020-11-05 19:18:13 +02:00
Aliaksandr Valialkin
6ce80425a6 docs/MetricsQL.md: fix a link to with templates page 2020-11-05 16:12:59 +02:00
Aliaksandr Valialkin
60747c5f14 docs/Articles.md: move third-party articles to the top 2020-11-05 15:03:50 +02:00
Aliaksandr Valialkin
bf28bc3792 docs: add CHANGELOG.md 2020-11-05 14:59:17 +02:00
Aliaksandr Valialkin
53193ca2bc CHANGELOG.md: add a link to issue related to /ready endpoint
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/875
2020-11-05 14:56:59 +02:00
Aliaksandr Valialkin
8a577568e3 CHANGELOG.md: mention about /ready endpoint in vmagent 2020-11-05 14:54:38 +02:00
Aliaksandr Valialkin
767231f41f app/vmstorage/transport: properly handle request to labelValuesOnTimeRange 2020-11-05 02:08:04 +02:00
Aliaksandr Valialkin
72011bcc45 app/vmselect: properly handle errors in GetLabelsOnTimeRange and GetLabelValuesOnTimeRange 2020-11-05 01:36:34 +02:00
Aliaksandr Valialkin
f2bff64933 lib/storage: remove data race when updating rowsDeleted 2020-11-05 01:19:30 +02:00
Aliaksandr Valialkin
c5e6c5f5a6 app/vmselect: optimize querying for /api/v1/labels and /api/v1/label/<name>/values when start and end args are set 2020-11-05 01:19:29 +02:00
Aliaksandr Valialkin
1336e47c86 docs/vmagent.md: update after 4c808d58bf 2020-11-04 20:33:49 +02:00
Nikolay
5b235b902b Adds ready probe (#874)
* adds leading forward slash check for scrapeURL path
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/835

* adds ready probe for scrape config initialization,
it should prevent metrics loss during vmagent rolling update,
/ready api will return 425 http code, if some scrape config still waits for initialization.

* updates docs

* Update app/vmagent/README.md

* renames var

* Update app/vmagent/README.md

Co-authored-by: Aliaksandr Valialkin <valyala@gmail.com>
2020-11-04 20:33:48 +02:00
Aliaksandr Valialkin
d8a7186019 docs/CaseStudies.md: add case study for Idealo.de 2020-11-04 20:19:26 +02:00
Aliaksandr Valialkin
2cd86d0220 lib/promscrape: docs update after e4182dd896
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/878
2020-11-04 17:13:34 +02:00
Nikolay
d0a9b24c5a reduces memory usage for vmagent, (#880)
* reduces memory usage for vmagent,
limits count of droppedTarget, that can be stored for /api/v1/targets page up to 999 items,
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/878

* Update app/vmagent/README.md

* Update app/vmagent/README.md

Co-authored-by: Aliaksandr Valialkin <valyala@gmail.com>
2020-11-04 17:13:33 +02:00
Aliaksandr Valialkin
692f5d7bca app/vmselect: reduce memory usage when query touches big number of time series 2020-11-04 16:48:10 +02:00
Aliaksandr Valialkin
c736339843 lib/{storage,mergeset}: clean cached index blocks and inmemory blocks more aggressively
Previously such blocks were cleaned after they weren't accessed during 10 minutes.
Now they are cleaned after one minute of missing access. This should reduce memory usage in general case.
2020-11-04 16:44:15 +02:00
Aliaksandr Valialkin
f35aafb6a5 Revert "lib/promscrape: add -promscrape.dropOriginalLabels command-line flag for reducing memory usage when discovering big number of scrape targets"
This reverts commit b08c6f5144.
2020-11-04 11:45:38 +02:00
Aliaksandr Valialkin
407a46c11e Revert "docs/vmagent.md: mention about -promscrape.dropOriginalLabels"
This reverts commit 1a80acc712.
2020-11-04 11:45:35 +02:00
Aliaksandr Valialkin
1a80acc712 docs/vmagent.md: mention about -promscrape.dropOriginalLabels 2020-11-04 11:16:16 +02:00
Aliaksandr Valialkin
b08c6f5144 lib/promscrape: add -promscrape.dropOriginalLabels command-line flag for reducing memory usage when discovering big number of scrape targets
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/878
2020-11-04 11:09:05 +02:00
Aliaksandr Valialkin
c046735571 lib/promscrape: reduce memory allocations in promLabelsString() function
This should help with reducing memory usage in https://github.com/VictoriaMetrics/VictoriaMetrics/issues/878
2020-11-04 10:38:59 +02:00
Aliaksandr Valialkin
c0bd208c77 lib/storage: do not report about the need of free disk space if parts cannot be merged due to too big write amplification 2020-11-03 15:32:09 +02:00
Aliaksandr Valialkin
887a3c317f app/vmagent/remotewrite: drop packets only on 409 status code, since there are other valid 4xx status codes, which shouldn't result in packet drop 2020-11-03 14:24:57 +02:00
Aliaksandr Valialkin
1b9778a756 lib/storage: remove unneeded fmt.Sprintf 2020-11-03 14:21:04 +02:00
John Belmonte
8653e2658e add short_version label to vm_app_version metric (#877)
* add short_version label to vm_app_version metric

use case:  Version panel of Grafana dashboard should use a live query, but currently it uses a template query which becomes stale.  Grafana is not able to perform regex substitution on labels.

* Update metrics.go

* fix compile
2020-11-03 14:12:42 +02:00
Aliaksandr Valialkin
40a4f5ded4 docs/MetricsQL.md: add missing whitespace 2020-11-02 23:49:45 +02:00
Aliaksandr Valialkin
dbd1789479 vendor: make vendor-update 2020-11-02 22:04:35 +02:00
Aliaksandr Valialkin
f3a7e6f6e3 lib/storage: remove obsolete code 2020-11-02 19:17:30 +02:00
Aliaksandr Valialkin
9715d9a3a8 CHANGELOG.md: cut v1.45.0 2020-11-02 04:00:41 +02:00
Aliaksandr Valialkin
9df946517c vendor: make vendor-update 2020-11-02 02:40:17 +02:00
Aliaksandr Valialkin
c76242d52d CHANGELOG.md: add a link to https://github.com/VictoriaMetrics/VictoriaMetrics/issues/825 2020-11-02 02:29:11 +02:00
Aliaksandr Valialkin
66de02fbb4 app/vmselect/promql: allow dropping trailing sample only for default_rollup function
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/850
2020-11-02 02:11:06 +02:00
Aliaksandr Valialkin
3ed9f1d5a9 lib/promscrape: properly handle response body after 301 redirect
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/869
2020-11-02 01:09:59 +02:00
Aliaksandr Valialkin
bc7db0bf4f CHANGELOG.md: mention about packets drop in vmagent like Prometheus does 2020-11-02 00:47:05 +02:00
Aliaksandr Valialkin
ca2e0f1e04 app/vmagent/remotewrite: drop packets if remote storage returns 4xx status code
This makes consistent the behaviour with Prometheus.
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/873
2020-11-02 00:45:01 +02:00
Aliaksandr Valialkin
6b623eba02 app/vmselect/promql: go fmt 2020-11-02 00:18:24 +02:00
Aliaksandr Valialkin
7c0b658865 app/vmselect/promql: do not drop trailing datapoints for instant queries
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/845
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/748
2020-11-02 00:12:53 +02:00
Aliaksandr Valialkin
ebe2782dcf CHANGELOG.md: mention about recently added bugfixes 2020-11-01 23:33:50 +02:00
Roman Khavronenko
333675875f vmalert: skip automatically added labels on alerts restore (#871)
Label `alertgroup` was introduced in #611 and automatically added to generated
time series. By mistake, this new label wasn't correctly purged on restore event
and affected alert's ID uniqueness. This commit removes `alertgroup` label
in restore function.

https://github.com/VictoriaMetrics/VictoriaMetrics/issues/870
2020-11-01 23:26:00 +02:00
Nikolay
e8fe618bbb fixes panic at scrape error body formatting, (#868)
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/864
regression after body reuse improvements
2020-11-01 23:25:23 +02:00
Nikolay
058f49de57 adds leading forward slash check for scrapeURL path (#855)
* fixes inconsistency with prometheus behaviour for scrape targets url path.
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/835
2020-11-01 23:22:16 +02:00
kreedom
40172c0721 vmbackup fix panic when no origin fs given (#859)
* use fsnil when no origin fs
2020-11-01 23:17:01 +02:00
Aliaksandr Valialkin
ed724d25ba lib/promscrape: add stream parse mode for efficient scraping of targets that expose millions of metrics 2020-11-01 23:12:26 +02:00
Aliaksandr Valialkin
901514be88 lib/storage: drop more samples outside the given retention during background merge
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/17
2020-10-31 20:44:47 +02:00
Aliaksandr Valialkin
abdf22e0bb app/vmagent: expose /api/v1/targets page according to https://prometheus.io/docs/prometheus/latest/querying/api/#targets
This page is exposed by vmagent and by a single-node VictoriaMetrics

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/643
2020-10-20 21:55:14 +03:00
Aliaksandr Valialkin
c4464594b7 app/vmselect/promql: allow passing optional third argument to topk_* and bottomk_* functions in order to obtain sum of time series outside top/bottom K 2020-10-20 20:09:55 +03:00
Aliaksandr Valialkin
7599e5c835 lib/storage: properly handle the case when key="__name__" is passed to MetricName.AddTag* 2020-10-20 20:09:52 +03:00
Aliaksandr Valialkin
9c5cd5a6c5 lib/storage: code cleanup after 5bfd4e6218 2020-10-20 16:10:53 +03:00
Aliaksandr Valialkin
0db7c2b500 app/vmstorage: support for -retentionPeriod smaller than one month
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/173
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/17
2020-10-20 14:42:46 +03:00
Aliaksandr Valialkin
b915bf01e4 CHANGELOG.md: mention about recently added changes 2020-10-20 14:42:43 +03:00
Aliaksandr Valialkin
5518b11720 lib/memory: do not print trailing zeroes in logs for -memory.allowedPercent command-line flag 2020-10-20 14:42:37 +03:00
Aliaksandr Valialkin
abdf020e22 docs/vmrestore.md: make docs-sync 2020-10-20 10:49:22 +03:00
kreedom
4526cf92d3 vmalert - add dryRun (#842)
vmalert: add `dryRun` flag for rules validation without running the service
2020-10-20 10:49:22 +03:00
Seva Poliakov
e6bf9eaac7 Fix typo in vmrestore readme 2020-10-20 10:49:22 +03:00
faceair
0093ee3cd9 disable response compression on websocket (#841) 2020-10-17 13:33:37 +03:00
Aliaksandr Valialkin
9b91305a31 docs/MetricsQL.md: small clarifications 2020-10-17 12:01:33 +03:00
Aliaksandr Valialkin
ee2902ddaf app/vmselect/promql: an attempt to improve heuristics for dropping trailing data points in time series
Now trailing data points are additionally dropped for time series with a single raw sample

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/748
2020-10-17 10:44:26 +03:00
Aliaksandr Valialkin
efb1989193 lib/storage: small code adjustments after d2960a20e0
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/781
2020-10-17 01:17:12 +03:00
faceair
8ddf089deb evaluate the execution cost of all tag filters (#824)
* evaluate the execution cost of all tag filters

* fix suffixes typo
2020-10-17 01:13:20 +03:00
Aliaksandr Valialkin
35791d9b29 CHANGELOG.md: mention about improved openstack endpoint handling
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/728
2020-10-16 23:06:23 +03:00
Nikolay Khramchikhin
a5d842caf8 fixes openstack api endpoint with suffix trim adds openstack (#840)
api v2.0 check

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/728
2020-10-16 23:01:45 +03:00
Aliaksandr Valialkin
8d87b57fbf deployment/docker: update Go builder from Go1.15.2 to Go1.15.3
This should fix potential issues related to Go runtime - see https://github.com/golang/go/issues?q=milestone%3AGo1.15.3+label%3ACherryPickApproved
2020-10-16 15:09:30 +03:00
Aliaksandr Valialkin
cde26141c0 vendor: make vendor-update 2020-10-16 15:05:54 +03:00
Aliaksandr Valialkin
da48a5a65c CHANGELOG.md: describe added optimization cases from 96cdfcba50 2020-10-16 12:59:48 +03:00
Aliaksandr Valialkin
8c027ba8a4 vendor: update github.com/VictoriaMetrics/metricsql from v0.7.1 to v0.7.2
The new release of github.com/VictoriaMetrics/metricsql adds more optimizations for `foo{filters1} op bar{filters2}`:

* rollup_func(foo[d]) op bar{filters}
* transform_func(foo) op bar{filters}
* num_or_scalar op bar op baz{filters}
2020-10-16 12:55:27 +03:00
Aliaksandr Valialkin
6227b71a67 docs: add a link to https://smarketshq.com/monitoring-kubernetes-clusters-41a4b24c19e3 article about VictoriaMetrics 2020-10-16 09:07:55 +03:00
Aliaksandr Valialkin
bc446ec62a docs/Single-server-VictoriaMetrics.md: update docs 2020-10-14 13:27:01 +03:00
Aliaksandr Valialkin
1634415441 docs/CaseStudies.md: actualize numbers for Wix.com 2020-10-14 13:07:41 +03:00
Aliaksandr Valialkin
e9fec0e5b8 docs/vmalert.md: make docs-sync 2020-10-13 18:35:00 +03:00
Roman Khavronenko
d6155a3f33 vmalert: update docs to highlight the state restore requirements; (#833)
Address https://github.com/VictoriaMetrics/VictoriaMetrics/issues/830
2020-10-13 18:34:00 +03:00
Aliaksandr Valialkin
4727bad124 CHANGELOG.md: cut v1.44.0 release 2020-10-13 16:59:47 +03:00
Aliaksandr Valialkin
de9d3f1332 docs/Cluster-VictoriaMetrics.md: clarify RAM requirements for vmstorage nodes 2020-10-13 16:47:25 +03:00
Aliaksandr Valialkin
b9a4601c97 app/vmselect/promql: return a single time series at max from absent() function like Prometheus does 2020-10-13 15:56:10 +03:00
Aliaksandr Valialkin
217c192c88 app/vmselect/promql: improve time series staleness detection
This should prevent from double counting for time series at the time when it changes label.
The most common case is in K8S, which changes pod uid label with each new deployment.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/748
2020-10-13 12:20:08 +03:00
Aliaksandr Valialkin
f877e703c8 app/vmselect/promql: fix mode_over_time calculations
Previously `mode_over_time` could return garbage due to improper shuffling of input data points.
2020-10-13 11:58:30 +03:00
Aliaksandr Valialkin
d884ab13dc app/vmselect/prometheus: fix golangci-lint warning 2020-10-13 09:36:18 +03:00
Aliaksandr Valialkin
0867dea5fc app/vmselect: add ability to export data in CSV format via /api/v1/export/csv 2020-10-12 20:08:08 +03:00
Aliaksandr Valialkin
6105756b26 CHANGELOG.md: mention about added Docker Swarm service discovery 2020-10-12 16:17:42 +03:00
Aliaksandr Valialkin
938b3b7ed1 lib/promscrape: code prettifying after 9bd9f67718 2020-10-12 16:13:59 +03:00
Nikolay Khramchikhin
7f96712b38 Adds dockerswarm sd (#818)
* adds dockerswarm service discovery

https://github.com/VictoriaMetrics/VictoriaMetrics/issues/656

 Following roles supported: services, tasks and nodes.
 Basic, token and tls auth supported.
 Added tests for labels generation.

* added unix socket support to discovery utils

Co-authored-by: Aliaksandr Valialkin <valyala@gmail.com>
2020-10-12 16:13:58 +03:00
Aliaksandr Valialkin
ac525462ce docs/MetricsQL.md: mention that VictoriaMetrics keeps metric names after applying functions which don't change time series meaning 2020-10-12 13:25:28 +03:00
Aliaksandr Valialkin
ba4bfc2bdf CHANGELOG.md: mention that VictoriaMetrics keeps metric names when applying functions which don't change time series meaning 2020-10-12 13:02:47 +03:00
Aliaksandr Valialkin
2d03d0e2dd app/vmselect/promql: keep metric name after applying more functions, which don't change time series meaning
Functions are:

* keep_last_value
* keep_next_value
* interpolate
* running_min
* running_max
* running_avg
* range_min
* range_max
* range_avg
* range_first
* range_last
* range_quantile
* smooth_exponential

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/674
2020-10-12 11:48:38 +03:00
Aliaksandr Valialkin
3881c84afe Revert "app/vmselect/promql: remove metric name after applying ceil, floor and round functions in order to be more consistent with Prometheus"
This reverts commit ac45082216.

Reason for revert: the previous behavior for VictoriaMetrics is easier to understand and use by users -
functions, which don't change the meaning of the time series shouldn't drop metric name.

Now the following functions do not drop metric names:

* ceil
* floor
* round

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/674
2020-10-12 11:48:38 +03:00
Aliaksandr Valialkin
79d70480b7 Revert "app/vmselect/promql: remove metric name after applying clamp_min and clamp_max functions in order to be consistent with Prometheus"
This reverts commit bb61a4769b.

Reason for revert: the previous behavior for VictoriaMetrics is easier to understand and use by users -
functions, which don't change the meaning of the time series shouldn't drop metric name.

Now the following functions do not drop metric name:

* clamp_min
* clamp_max

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/674
2020-10-12 11:48:38 +03:00
Aliaksandr Valialkin
8c37b63ea9 Revert "app/vmselect/promql: remove metric name from results of certain rollup functions in order to be consistent with Prometheus"
This reverts commit e5202a4eae.

Reason for revert: the previous behavior for VictoriaMetrics is easier to understand and use by users -
functions, which don't change the meaning of the time series shouldn't drop metric name.

Now the following functions do not drop metric name:

* max_over_time
* min_over_time
* avg_over_time
* quantile_over_time
* geomean_over_time
* mode_over_time
* holt_winters
* predict_linear

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/674
2020-10-12 11:48:38 +03:00
Aliaksandr Valialkin
85d507f71d docs/Single-server-VictoriaMetrics.md: add missing whitespace 2020-10-09 20:56:45 +03:00
Aliaksandr Valialkin
de1c07b937 lib/backup: add MustStop() method for all remote filesystems 2020-10-09 15:32:13 +03:00
Aliaksandr Valialkin
bf6d523bef lib/backup/fslocal: add FS.MustStop() method for stopping bandwidth limiter 2020-10-09 15:11:55 +03:00
Aliaksandr Valialkin
00b5568ca4 CHANGELOG.md: update with recent changes 2020-10-09 14:22:17 +03:00
Aliaksandr Valialkin
9b7ce5d004 app/{vminsert,vmagent}: take into account all the inserted rows before relabeling in vm_rows_inserted_total and vmagent_rows_inserted_total metrics 2020-10-09 13:38:49 +03:00
Aliaksandr Valialkin
d2e917d1cb app/vmstorage: add vm_rows_added_to_storage_total metric, which shows the total number of rows added to storage since app start 2020-10-09 13:36:17 +03:00
Aliaksandr Valialkin
4b1c401790 app/vmalert: accept days, weeks and years in for: part of config like Prometheus does
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/817
2020-10-08 20:13:20 +03:00
Aliaksandr Valialkin
a93c62cd60 lib/promscrape: fix tests after 71ea4935de 2020-10-08 19:32:48 +03:00
Aliaksandr Valialkin
35b8ffaa17 docs/vmagent.md: clarify -promscrape.suppressDuplicateScrapeTargetErrors command-line flag usage 2020-10-08 19:24:05 +03:00
Aliaksandr Valialkin
85a1ab3edd CHANGELOG.md: mention features from 71ea4935de 2020-10-08 19:14:45 +03:00
Aliaksandr Valialkin
0d44e371f3 lib/promscrape: add -promscrape.suppressDuplicateScrapeTargetErrors command-line flag in order to suppress duplicate scrape target errors
Show also original labels for duplicate targets in error message in order to simplify debugging the issue.

Now `/targets` endpoint accepts optional `show_original_labels=1` query arg, which shows original labels for each target.
This may simplify debugging for target relabeling.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/651
2020-10-08 18:59:25 +03:00
Aliaksandr Valialkin
ad0950f630 lib/backup/actions: improve logging to be more clear to humans 2020-10-08 14:23:00 +03:00
Aliaksandr Valialkin
f9f8e4a39c app/vmalert: do not print description for all the flags on config errors
The description is too big to consume by human and it just distracts humans.
2020-10-08 13:35:46 +03:00
Aliaksandr Valialkin
7786ed95b5 vendor: make vendor-update 2020-10-08 11:52:43 +03:00
Aliaksandr Valialkin
3e2bf32872 vendor: update github.com/VictoriaMetrics/metricsql from v0.7.0 to v0.7.1 2020-10-08 11:47:23 +03:00
Aliaksandr Valialkin
ef9280201c CHANGELOG.md: mentioned about the added optimization that adds missing filters to binary operands 2020-10-07 21:23:16 +03:00
Aliaksandr Valialkin
f6ee6efc34 app/vmselect/promql: add missing label filters to binary operands before query execution
This implements the optimization described at https://utcc.utoronto.ca/~cks/space/blog/sysadmin/PrometheusLabelNonOptimization

See also https://github.com/cortexproject/cortex/issues/3253
2020-10-07 21:17:11 +03:00
Aliaksandr Valialkin
b8024db965 CHANGELOG.md: mention about -finalMergeDelay command-line flag 2020-10-07 18:52:35 +03:00
Dmitry Shihovtsev
aec863e70b Fix typos in the vmalert datasource (#814)
* Fix typos in the vmalert datasource

* Fix typo in the vmalert datasource test
2020-10-07 18:00:29 +03:00
Artem Navoiev
9f1656145b update go action 2020-10-07 17:48:38 +03:00
Aliaksandr Valialkin
b51fa16177 app/vmstorage: add -finalMergeDelay command-line flag for configuring the delay before final merge for per-month partitions after no new data is ingested to it 2020-10-07 17:42:31 +03:00
Aliaksandr Valialkin
5d5076c4a2 docs/CaseStudies.md: actualize Wix numbers 2020-10-06 16:09:30 +03:00
Aliaksandr Valialkin
b4356550fd CHANGELOG.md: add missing link to an issue about OpenStack service discovery - https://github.com/VictoriaMetrics/VictoriaMetrics/issues/728 2020-10-06 15:37:26 +03:00
Aliaksandr Valialkin
c4d309aa41 app/vmagent: add a link to https://victoriametrics.github.io/vmagent.html from main page 2020-10-06 15:30:07 +03:00
Aliaksandr Valialkin
abfd3a8fab app/{vminsert,vmselect,vmstorage}: add a link to https://victoriametrics.github.io/Cluster-VictoriaMetrics.html from main page of every cluster component 2020-10-06 15:30:07 +03:00
Aliaksandr Valialkin
70bb5624ad docs/Articles.md: add https://medium.com/@VictoriaMetrics/anomaly-detection-in-victoriametrics-9528538786a7 2020-10-06 15:30:07 +03:00
Aliaksandr Valialkin
1602f55794 CHANGELOG.md: cut v1.43.0 release 2020-10-06 14:29:06 +03:00
Aliaksandr Valialkin
49451854ef CHANGELOG.md: add missing entries for upcoming release 2020-10-06 12:04:52 +03:00
Aliaksandr Valialkin
54ff78c6c9 lib/protoparser/graphite: support parsing floating-point timestamp like Graphite does
Such timestamps are rounded to seconds like Carbon does.
See b0ba62a62d/lib/carbon/protocols.py (L197)
2020-10-06 11:38:35 +03:00
Aliaksandr Valialkin
1da41177a8 lib/promscrape/discovery/openstack: show expiration time for refreshed OpenStack token in seconds - this is easier to interpret by human 2020-10-06 11:20:36 +03:00
Aliaksandr Valialkin
97b836a6f4 lib/fs: fix GOOS=openbsd build by adding fadviseSequentialRead implementation.
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/785
2020-10-05 23:32:28 +03:00
Aliaksandr Valialkin
812c670d60 vendor: make vendor-update 2020-10-05 23:19:49 +03:00
Aliaksandr Valialkin
4e8fad94a5 .github/workflows: switch Go version from v1.14 to v1.15 2020-10-05 22:00:47 +03:00
Aliaksandr Valialkin
50316070d6 lib/promscrape/discovery/openstack: code prettifying after cbe3cf683b
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/728
2020-10-05 18:12:31 +03:00
Nikolay Khramchikhin
b4c77fc6d2 Adds openstack sd (#811)
* adds openstack service discovery

https://github.com/VictoriaMetrics/VictoriaMetrics/issues/728

 implemented hypervisors and instance discovery with openstack v3 api.
 Added tests for labeling and data parsing.
 Added token refresh.

* Apply suggestions from code review

* Apply suggestions from code review

Co-authored-by: Aliaksandr Valialkin <valyala@gmail.com>
2020-10-05 16:48:54 +03:00
Aliaksandr Valialkin
5f15c52d21 docs: make docs-sync after cbe3cf683b 2020-10-05 16:48:25 +03:00
Aliaksandr Valialkin
7505e4f390 lib/promrelabel: make a copy of label with new name for action: labelmap in the same way as Prometheus does
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/812
2020-10-05 16:20:22 +03:00
Aliaksandr Valialkin
9a7c863bd8 lib/protoparser/influx: add -influx.maxLineSize command-line flag for configuring the maximum size for a single Influx line during parsing
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/807
2020-10-05 15:19:11 +03:00
Aliaksandr Valialkin
11ddb79aeb lib/decimal: add tests for negative values passed to maxUpExponent 2020-10-05 14:57:03 +03:00
Aliaksandr Valialkin
d448e07546 lib/decimal: properly calibrate scale for blocks with Inf values
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/805
2020-10-05 14:52:59 +03:00
Aliaksandr Valialkin
3e0c473cc9 app/vmselect/promql: fill gaps on graphs for range_* and running_* functions
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/806
2020-10-02 14:02:47 +03:00
Aliaksandr Valialkin
fd7dd5064a lib/storage: code cleanup after 10f2eedee0
Remove the code that uses metricIDs caches for the current and the previous hour during metricIDs search,
since this code became unused after implementing per-day inverted index almost a year ago.

While at it, fix a bug, which could prevent from finding time series with names containing dots (aka Graphite-like names
such as `foo.bar.baz`).
2020-10-01 19:12:04 +03:00
Aliaksandr Valialkin
3ad7566a87 lib/storage: improve cache effectiveness for time series ids matching the given filters
Previously the maximum cache lifetime has been limited by 10 seconds. Now it is extended up to a day.
This should reduce CPU usage in the following cases:

* when querying recently added data with small churn rate for time series
* when querying historical data
2020-10-01 14:39:46 +03:00
Aliaksandr Valialkin
40df42e1e5 docs: make docs-sync 2020-09-30 09:51:16 +03:00
Roman Khavronenko
368b890e11 vmalert: make maxIdleConnections configurable for datasource HTTP client (#797)
Address https://github.com/VictoriaMetrics/VictoriaMetrics/issues/795
2020-09-30 09:51:14 +03:00
Aliaksandr Valialkin
af7a0f7aff vendor: make vendor-update 2020-09-30 08:59:53 +03:00
Aliaksandr Valialkin
10b6982ef8 docs/Release-Guide.md: mention that CHANGELOG.md must be updated before release 2020-09-30 08:53:27 +03:00
Aliaksandr Valialkin
ebb1f9ebb6 CHANGELOG.md: release v1.42.0 2020-09-30 08:45:10 +03:00
Aliaksandr Valialkin
081c5d2c74 CHANGELOG.md: initial commit
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/788
2020-09-30 00:12:23 +03:00
Aliaksandr Valialkin
7c2e4e267a lib/storage: allow set values higher than 1 for vm_merge_need_free_disk_space if there are multiple partitions with deferred merges due to disk space shortage 2020-09-29 22:53:34 +03:00
Aliaksandr Valialkin
536aa8779a app/vmstorage: rename vm_{big|small}_merge_need_free_disk_space to vm_merge_need_free_disk_space
This simplifies alerting.
2020-09-29 22:53:33 +03:00
Aliaksandr Valialkin
097a4c10dd app/vmstorage: add metrics for determining whether background merges need additional disk space to complete
These metrics are:

* vm_small_merge_need_free_disk_space
* vm_big_merge_need_free_disk_space

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/686
2020-09-29 21:47:47 +03:00
Aliaksandr Valialkin
a4361a4c07 docs/Single-server-VictoriaMetrics.md: typo fix 2020-09-29 20:30:26 +03:00
Aliaksandr Valialkin
7ae0ab976a docs/Single-server-VictoriaMetrics.md: typo fix: compations -> compactions 2020-09-29 20:27:37 +03:00
Aliaksandr Valialkin
5dca7bbe85 app/vmagent/remotewrite: do not show -remoteWrite.url in logs if -remoteWrite.showURL isn't set
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/773
2020-09-29 19:49:19 +03:00
Aliaksandr Valialkin
61f4801b93 app/vmselect/graphite: do not substitute path and path. with path.. in /metrics/find/?format=completer output 2020-09-29 18:03:45 +03:00
Aliaksandr Valialkin
5e998e597a lib/cgroup: do not adjust the number of detected CPU cores via /sys/devices/system/cpu/online
The adjustment increases the resulting GOMAXPROCS by 1, which looks confusing to users
as outlined at https://github.com/VictoriaMetrics/VictoriaMetrics/issues/685#issuecomment-698595309
2020-09-29 13:55:53 +03:00
Aliaksandr Valialkin
f4d74ccfec docs/{vmbackup,vmrestore}: formatting fixes 2020-09-29 13:19:13 +03:00
Aliaksandr Valialkin
5179df7082 docs/vmbackup.md: make docs about minio config more prominent 2020-09-29 13:16:34 +03:00
Aliaksandr Valialkin
338a53ccf9 lib/storage: fix tests for 32-bit arches such as GOARCH=386 and GOARCH=arm 2020-09-29 13:10:37 +03:00
Aliaksandr Valialkin
79b00f7b6b docs: improve readability a bit 2020-09-29 13:05:05 +03:00
Nikolay Khramchikhin
d77f56fd97 update vmbackup/vmrestore README usage (#794)
* update vmbackup/vmrestore README usage

https://github.com/VictoriaMetrics/VictoriaMetrics/issues/381

with minio and configuration file examples.

* Apply suggestions from code review

Co-authored-by: Roman Khavronenko <hagen1778@gmail.com>

* added backup/restore docs changes

* added example for relabelConfig flag

* Apply suggestions from code review

Co-authored-by: Aliaksandr Valialkin <valyala@gmail.com>
Co-authored-by: Roman Khavronenko <hagen1778@gmail.com>
2020-09-29 13:05:04 +03:00
Aliaksandr Valialkin
ef416c72c2 lib/storage: fix 32-bit builds for GOARCH=386 or GOARCH=arm 2020-09-29 12:42:25 +03:00
Aliaksandr Valialkin
0b0259c42c lib/protoparser/prometheus: sort rows before comparing them in TestParseStream, since the order for callback calls is non-deterministic 2020-09-29 12:29:50 +03:00
Aliaksandr Valialkin
8f25206d8c lib/protoparser/prometheus: fix TestParseStream after 124f78857b 2020-09-29 12:12:33 +03:00
Aliaksandr Valialkin
0d082cdf53 app/vmselect/netstorage: remove unused code 2020-09-29 11:55:08 +03:00
Aliaksandr Valialkin
c47972d843 app/vmselect/prometheus: check for errors returned from bufferedwriter.Write
This makes `make errcheck` happy
2020-09-29 11:36:54 +03:00
Aliaksandr Valialkin
e66f7edfc9 app/vmselect/graphite: properly handle case when /metrics/find finds both leaf and node for the given query=prefix.*
In this case only node must be returned with stripped dot in the end of id as carbonapi does
2020-09-29 11:29:38 +03:00
Aliaksandr Valialkin
ffa6581c46 app/vminsert: refresh the list of healthy storage nodes only if the row cannot be sent to destination storage node
Previously the list had been generated for each rerouted row. This could consume additional CPU time during rerouting,
which could lead to rerouting slowdown.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/791
2020-09-29 01:29:24 +03:00
Aliaksandr Valialkin
1e7452e501 .github/workflows: verify builds for vmagent, vmalert, vmbackup and vmrestore 2020-09-29 00:47:54 +03:00
Aliaksandr Valialkin
f68bf12a84 .github/workflows: verify that VictoriaMetrics can be built for GOOS=openbsd 2020-09-29 00:45:38 +03:00
Aliaksandr Valialkin
81cdf2fa14 lib/{fs,filestream}: small consistency-related updates after cc90a548b1 2020-09-29 00:43:20 +03:00
Nikolay Khramchikhin
658a05ef0f added openbsd implementations (#790)
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/785

removed fadvise for openbsd, added freespace implementation for openbsd
2020-09-29 00:43:19 +03:00
Aliaksandr Valialkin
bc37f1cbec app/vminsert: do not pollute logs with repeated cannot dial storageNode errors
Log only the first error per -storageNode
2020-09-29 00:20:32 +03:00
Aliaksandr Valialkin
0548f0c5c8 vendor: make vendor-update 2020-09-28 21:59:20 +03:00
Aliaksandr Valialkin
2ee0dc27a6 app/vmstorage: parallelize data processing obtained from a single connection from vminsert
Previously vmstorage could use only a single CPU core for data processing from a single connection from vminsert.
Now all the CPU cores can be used for data processing from a single connection from vminsert.
This should improve the maximum data ingestion performance for a single vminsert->vmstorage connection.
2020-09-28 21:41:16 +03:00
Aliaksandr Valialkin
9d123eb22a app/vminsert: remove useless delays when sending data to vmstorage
This improves the maximum data ingestion performance for cluster VictoriaMetrics

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/791
2020-09-28 21:41:15 +03:00
Aliaksandr Valialkin
1481d6d8ff lib/protoparser: avoid copying of buffer read from the network to unmarshal buffer 2020-09-28 17:19:04 +03:00
Aliaksandr Valialkin
8df33bd5c1 app/{vminsert,vmagent}: improve data ingestion speed over a single connection
Process data obtained from a single connection on all the available CPU cores.
2020-09-28 04:14:51 +03:00
Aliaksandr Valialkin
7072db75cb lib/protoparser: use 64KB read buffer instead of default 4KB buffer provided by net/http.Server
This should reduce syscall overhead when reading big amounts of data
2020-09-28 02:07:19 +03:00
Aliaksandr Valialkin
6d8c23fdbd app/{vminsert,vmselect}: skip accountID and projectID when marshaling/unmarshaling MetricName in /api/v1/export/native and /api/v1/import/native
This is needed in order to be able to migrate native data from/to single-node VictoriaMetrics
2020-09-28 00:58:58 +03:00
Aliaksandr Valialkin
db14f22fc0 app/vmselect: stop /api/v1/export/* execution if client disconnects 2020-09-27 23:52:42 +03:00
Aliaksandr Valialkin
aadbd014ff all: add native format for data export and import
The data can be exported via [/api/v1/export/native](https://victoriametrics.github.io/#how-to-export-data-in-native-format) handler
and imported via [/api/v1/import/native](https://victoriametrics.github.io/#how-to-import-data-in-native-format) handler.
2020-09-27 17:36:38 +03:00
Aliaksandr Valialkin
00ec2b7189 lib/protoparser: use all the available CPU cores for processing ingested data from a single /api/v1/import stream
Previously a single data ingestion stream to /api/v1/import could load only a single CPU core.
2020-09-26 04:22:06 +03:00
Aliaksandr Valialkin
e83947a882 app/vminsert: code prettifying 2020-09-26 04:16:14 +03:00
Aliaksandr Valialkin
8fc9b77496 app/vmagent: reduce memory usage when importing data via /api/v1/import
Previously vmagent could use big amounts of RAM when each ingested JSON line
contained many samples.
2020-09-26 04:10:13 +03:00
Aliaksandr Valialkin
973df09686 app/vmselect/netstorage: do not spend CPU time on unpacking empty blocks during /api/v1/series calls 2020-09-24 20:44:15 +03:00
Aliaksandr Valialkin
533bf76a12 lib/storage: correctly use maxBlockSize in various checks
Previously `maxBlockSize` has been multiplied by 8 in certain checks. This is unnecessary.
2020-09-24 18:13:15 +03:00
Aliaksandr Valialkin
b8bce348c5 app/vmselect/promql: properly limit implicitly set rollup window to -search.maxStalenessInterval
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/784
2020-09-23 23:24:09 +03:00
Aliaksandr Valialkin
543f3aea97 all: consistently use "%w" formatting in fmt.Errorf for wrapped errors 2020-09-23 22:48:21 +03:00
Aliaksandr Valialkin
265d892a8c vendor: make vendor-update 2020-09-23 14:23:51 +03:00
Aliaksandr Valialkin
8627365b48 app/vmselect/prometheus: code cleanup after 3ba507000c 2020-09-23 13:04:51 +03:00
Aliaksandr Valialkin
1fce79518a app/vmselect/prometheus: return timestamps from /api/v1/query, which match the time query arg
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/720
2020-09-23 12:59:23 +03:00
Aliaksandr Valialkin
90d2549428 lib/persistentqueue: protect from multiple concurrent opening for the same persistent queue 2020-09-23 02:17:53 +03:00
Aliaksandr Valialkin
0468cdf33e app/vmselect/netstorage: properly pre-allocate space for sbs 2020-09-22 23:51:01 +03:00
Aliaksandr Valialkin
89d652b583 lib/cgroup: attempt to obtain available CPU cores via /sys/devices/system/cpu/online
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/685#issuecomment-674423728
2020-09-22 23:27:26 +03:00
Aliaksandr Valialkin
4ebd2fa560 vendor: make vendor-update 2020-09-22 17:08:54 +03:00
Aliaksandr Valialkin
00ff01f766 docs/Single-server-VictoriaMetrics.md: VictoriaMetrics properly stores Inf values after 26115891db 2020-09-22 17:03:16 +03:00
Aliaksandr Valialkin
69eb9783e6 app/vmselect: make sure the request doesn't wait in pending queue more than the configured timeout
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/711
2020-09-22 01:21:42 +03:00
Aliaksandr Valialkin
31e341371b lib/storage: code prettifying after be5e1222f3
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/781
2020-09-22 00:42:20 +03:00
faceair
ad41e39350 add filter to getMetricIDs (#783)
* add getMetricIDs filter

* check nil filter before use
2020-09-22 00:42:19 +03:00
Aliaksandr Valialkin
ed473c94ff docs/vmagent.md: typo fix 2020-09-21 21:49:08 +03:00
Aliaksandr Valialkin
bedc971398 docs: make docs-sync 2020-09-21 21:48:05 +03:00
Aliaksandr Valialkin
e564725641 app/vmselect/searchutils: fixed tests after 2eb72e09ab 2020-09-21 21:31:28 +03:00
Aliaksandr Valialkin
07c6226334 app/vmselect: use time value rounded to seconds if it isn't passed to /api/v1/query
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/720
2020-09-21 21:24:46 +03:00
Aliaksandr Valialkin
d32c3f747c lib/logger: add -loggerDisableTimestamps command-line flag for disabling timestamps in logs
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/778
2020-09-21 19:25:50 +03:00
Aliaksandr Valialkin
f961838290 lib/promscrape/discovery/ec2: code prettifying after 312fead9a2 2020-09-21 18:44:04 +03:00
Nikolay Khramchikhin
0069353d5e Add improvements to ec2_sd_discovery (#775)
* Add improvements to ec2 discovery

https://github.com/VictoriaMetrics/VictoriaMetrics/issues/771

 role_arn support with aws sts
 instance iam_role support
 refreshing temporary tokens

* Apply suggestions from code review

Co-authored-by: Roman Khavronenko <hagen1778@gmail.com>

* changed implementation, removed tests, clean up code

* moved endpoint builder into getEC2APIResponse

Co-authored-by: Roman Khavronenko <hagen1778@gmail.com>
2020-09-21 16:05:01 +03:00
Aliaksandr Valialkin
8cd89cb847 app/vmalert: remove unneeded UTC() call
UTC() doesn't change the underlying timestamp, so the call isn't needed here
2020-09-21 15:56:48 +03:00
Roman Khavronenko
d111969d39 vmalert: add support for datasource.lookback flag (#779)
New datasource flag `datasource.lookback` defines how far to look into
past when evaluating queries.

Address https://github.com/VictoriaMetrics/VictoriaMetrics/issues/668
2020-09-21 15:56:47 +03:00
Aliaksandr Valialkin
a9321f6a60 lib/storage: reduce CPU load for idle VictoriaMetrics by reducing the frequency for the need for background merges 2020-09-21 15:51:26 +03:00
Roman Khavronenko
0042b0f307 vmalert: fix the typo in error message (#782)
The error will be always nil so no sense in printing it.
2020-09-21 11:36:09 +03:00
Aleksey Shirokih
a7e17f14f5 dashboard: add account id to datapoints ingestion rate (#772) 2020-09-19 22:00:16 +01:00
Aliaksandr Valialkin
604e8f6114 lib/decimal: optimize maxUpExponent() by eliminating division from hot path 2020-09-19 13:49:43 +03:00
Aliaksandr Valialkin
df547bf345 lib/persistentqueue: sync data to file inside filestream.Writer.MustFlush 2020-09-19 12:51:46 +03:00
Aliaksandr Valialkin
778ea183ca lib/decimal: properly store Inf values
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/752
2020-09-18 19:08:53 +03:00
Aliaksandr Valialkin
1c07d7bee7 app/vmagent: increase default value for -remoteWrite.queues from 1 to 4, since it turned out that many users hit this limit 2020-09-18 14:22:02 +03:00
Aliaksandr Valialkin
5bd30171b3 vendor: update github.com/valyala/quicktemplate from v1.6.2 to v1.6.3 2020-09-18 13:10:55 +03:00
Aliaksandr Valialkin
adb4d3b75c lib/persistentqueue: flush data to disk every second
Previously small amounts of data may be left unflushed for extended periods of time if vmagent collects small amounts of data.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/687
2020-09-18 13:06:03 +03:00
Aliaksandr Valialkin
8fd791d399 lib/promscrape: avoid copying response body when scraping targets.
This should reduce memory usage when scraping targets with millions of metrics.
2020-09-18 13:06:00 +03:00
Aliaksandr Valialkin
e25b90849f vendor: update github.com/VictoriaMetrics/fasthttp from v1.0.5 to v1.0.7 2020-09-18 12:20:48 +03:00
Aliaksandr Valialkin
a1bebb660c app/vmselect/graphite: return proper results /metrics/find?query=foo.*.bar according to Graphite Metrics API 2020-09-18 11:53:28 +03:00
Aliaksandr Valialkin
9b15b11f74 app/vmstorage: added -forceMergeAuthKey command-line flag for protecting /internal/force_merge endpoint 2020-09-17 14:24:20 +03:00
Aliaksandr Valialkin
d96858b921 lib/storage: add /internal/force_merge handler for running forced compactions on historical per-month partitions
This may be useful for freeing up storage space after time series deletion.

See https://victoriametrics.github.io/#force-merge for more details.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/686
2020-09-17 12:20:56 +03:00
Aliaksandr Valialkin
3abbb38254 lib/{mergeset,storage}: compare errors with errors.Is() 2020-09-17 03:03:10 +03:00
Aliaksandr Valialkin
ddb3519e17 lib/{mergeset,storage}: code prettifying 2020-09-17 02:06:37 +03:00
Aliaksandr Valialkin
bf826dd828 lib/storage: removed duplicate checks for empty parts during merge - another check is in the beginning of mergeParts functions 2020-09-17 01:49:08 +03:00
Aliaksandr Valialkin
7429fb2301 vendor: make vendor-update 2020-09-17 01:44:16 +03:00
Aliaksandr Valialkin
2084496462 docs/Single-server-VictoriaMetrics.md: document that /api/v1/series/count may count deleted time series
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/770
2020-09-17 01:39:09 +03:00
Aliaksandr Valialkin
406f4fe445 app/vmagent: substitute -remoteWrite.url with secret-url value in logs, since it may contain sensitive info such as passwords or auth tokens
Pass `-remoteWrite.showURL` command-line flag in order to see real `-remoteWrite.url` values in logs and at `/metrics` page.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/773
2020-09-16 22:36:18 +03:00
Aliaksandr Valialkin
9793008734 app/vmselect: add -search.storageTimeout command-line flag for limiting the maximum duration of query execution per each -storageNode
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/711
2020-09-16 21:33:47 +03:00
Aliaksandr Valialkin
9705ac5d7a lib/persistentqueue: code simplification after d455764a6f 2020-09-16 21:14:01 +03:00
Aliaksandr Valialkin
a9205fe308 app/vmselect: prevent from closing connection to vmstorage on query timeout by setting +2 secs deadline on connection comparing to query deadline
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/711
2020-09-16 21:14:00 +03:00
Aliaksandr Valialkin
eee6f1e56d lib/persistentqueue: make the persistent queue more durable against unclean shutdown (kill -9, OOM, hard reset)
The strategy is:

- Periodical flushing of inmemory blocks to files, so they aren't lost on unclean shutdown.
- Periodical syncing of metadata for persisted queues, so the metadata remains in sync with the persisted data.
- Automatic adjusting of too big chunk size when opening the queue. The chunk size may be bigger than the writer offset after unclean shutdown.
- Skipping of broken chunk file if it cannot be read.
- Fsyncing finalized chunk files.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/687
2020-09-16 18:13:24 +03:00
Aliaksandr Valialkin
6ce52e3702 lib/protoparser/vmimport: add more testcases for invalid timestamps and values
Updates https://github.com/VictoriaMetrics/vmctl/issues/25
2020-09-16 02:21:53 +03:00
Aliaksandr Valialkin
cd87ca303f lib/protoparser: report more errors for incorrect timestamps and/or values
Previously certain errors in timestamps and/or values could be silently skipped,
which could lead to samples with zero values stored in the database.

Updates https://github.com/VictoriaMetrics/vmctl/issues/25
2020-09-16 02:16:15 +03:00
Aliaksandr Valialkin
5c4e111b43 lib/protoparser/graphite: return error when value or timestamp cannot be properly parsed
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/99
2020-09-16 02:16:13 +03:00
Aliaksandr Valialkin
1587f83fa0 app/vmselect/netstorage: typo fix after 03dfccfbed 2020-09-16 00:10:33 +03:00
Aliaksandr Valialkin
2a86e2fb98 vendor: update github.com/valyala/fastjson from v1.5.4 to v1.5.5
This should properly parse `+Inf` values when importing JSON lines via `/api/v1/import`

Updates https://github.com/VictoriaMetrics/vmctl/issues/25
2020-09-16 00:09:05 +03:00
Aliaksandr Valialkin
d39a985d6b docs/Single-server-VictoriaMetrics.md: mention that VictoriaMetrics ignores NaN and Inf values during data ingestion
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/752
2020-09-15 23:41:10 +03:00
Aliaksandr Valialkin
03dfccfbed app/vmselect/netstorage: reduce memory usage when the time range from query touches big number of samples per each time series 2020-09-15 21:08:09 +03:00
Aliaksandr Valialkin
27cd5555e6 app/vmselect/netstorage: mention that RunParallel or Cancel must be called on the returned results from ProcessSearchQuery 2020-09-15 20:39:43 +03:00
Aliaksandr Valialkin
1f0c0b0f6b app/vmselect: typo fix in -search.maxStalenessInterval description 2020-09-15 14:25:34 +03:00
Aliaksandr Valialkin
0c1c1b79ba lib/promscrape: add a link to troubleshooting docs to error message when duplicate scrape target with identical labels is skipped 2020-09-15 14:16:20 +03:00
Aliaksandr Valialkin
5f04b00b2d docs/Articles.md: add a link to https://medium.com/miro-engineering/prometheus-high-availability-and-fault-tolerance-strategy-long-term-storage-with-victoriametrics-82f6f3f0409e 2020-09-15 12:29:44 +03:00
Aliaksandr Valialkin
ca08161b54 lib/promscrape: typo fix 2020-09-12 00:14:15 +03:00
Roman Khavronenko
e2b31590e6 vmalert: add Group name as label to generated alerts and timeseries (#761)
Solves #611
2020-09-11 23:41:12 +03:00
Roman Khavronenko
16e0bb496e vmalert: update groups on config reload only if changes detected (#759)
On config reload event `vmalert` reloads configuration for every group. While
it works for simple configurations, the more complex and heavy installations may
suffer from frequent config reloads.
The change introduces the `checksum` field for every group and is set to md5 hash
of yaml configuration. The checksum will change if on any change to group
definition like rules order or annotation change. Comparing the `checksum` field
on config reload event helps to detect if group should be updated.
The groups update is now done concurrently, so reload duration will be limited by
the slowest group now.

Partially solves #691 by improving config reload speed.
2020-09-11 23:41:12 +03:00
Aliaksandr Valialkin
e53235ac5c lib/promscrape: do not reset the remaining rows when pushing a part of data to remote storage during big scrapes
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/753

Thanks to @PerGon and @clmssz for help with debugging.
2020-09-11 23:38:17 +03:00
Aliaksandr Valialkin
b776a93608 app/vmselect/promql: support composite durations like Prometheus 2.21 does
The following durations are supported now: `1h5m35s` or `1s543ms`

See https://github.com/prometheus/prometheus/releases/tag/v2.21.0
and https://github.com/prometheus/prometheus/pull/7713
2020-09-11 22:17:24 +03:00
Aliaksandr Valialkin
114cf24b43 lib/promscrape/discovery/dns: add __meta_dns_srv_record_target and __meta_dns_srv_record_port labels
This syncs dns service discovery with Prometheus 2.21 - see https://github.com/prometheus/prometheus/releases
and https://github.com/prometheus/prometheus/pull/7678 .
2020-09-11 21:35:39 +03:00
Aliaksandr Valialkin
6382e8081a app/vmagent: allow setting multiple identical -remoteWrite.url values
This may be useful when each url is authenticated via different `-remoteWrite.basicAuth.username`.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/755
2020-09-11 15:17:29 +03:00
Aliaksandr Valialkin
5cf5a0e8c4 lib/protoparser/common: do not read request body when parsing timestamp query arg
This was preventing from reading data via /api/v1/prometheus/import .

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/750
2020-09-11 14:45:21 +03:00
Aliaksandr Valialkin
81c05f669b lib/storage: do not store inf values, since they may lead to significant precision loss for previously stored values
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/752
2020-09-11 14:45:20 +03:00
Aliaksandr Valialkin
8bd5aa3516 lib/protoparser: accept timestamp in milliseconds instead of seconds at /api/v1/import/prometheus
This improves consistency with timestamps in Prometheus text exposition format
2020-09-11 14:05:24 +03:00
Aliaksandr Valialkin
d3ad0d365e app/vmselect: move Deadline from netstorage to searchutils
This removes dependency on netstorage from searchutils.
2020-09-11 13:39:13 +03:00
Aliaksandr Valialkin
58d3b82ae5 app/{vminsert,vmagent}: allow passing timestamp via timestamp query arg when ingesting data to /api/v1/import/prometheus
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/750
2020-09-11 13:28:31 +03:00
Nikolay Khramchikhin
af994562c8 Added endpointslices discovery to k8s api (#760)
This is similar to https://github.com/prometheus/prometheus/pull/6838 , which will be added in Prometheus v2.21.
See https://github.com/prometheus/prometheus/releases/tag/v2.21.0-rc.1

* Added endpointslices discovery to k8s api

Starting from k8s version 1.17 endpointslices is beta;
it allows querying k8s api for endpoints more efficiently.
It presents at scrape_config.yaml as separate role for kubernetes_sd_config.
kubernetes_sd_config:
- role: endpointslices

* fixed typos, changed EndpointConditions signature - with values instead of pointers
2020-09-11 12:24:50 +03:00
Aliaksandr Valialkin
579c20756a app/vmselect: substitute inf values at smooth_exponential with the previous values
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/757
2020-09-11 12:23:56 +03:00
Aliaksandr Valialkin
d67e6d3d2e app/vmselect: skip infinite values when calculating smooth_exponential
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/757
2020-09-11 11:57:53 +03:00
John Belmonte
a72a2566d7 fix typo on outliersk() doc (#758) 2020-09-11 01:59:55 +03:00
Aliaksandr Valialkin
06427a184f app/vmselect/graphite: typo fix in label name for vm_request_duration_seconds metric 2020-09-11 01:59:52 +03:00
Aliaksandr Valialkin
f307e6f432 app/vmselect: initial implementation of Graphite Metrics API
See https://graphite-api.readthedocs.io/en/latest/api.html#the-metrics-api
2020-09-11 00:30:20 +03:00
Aliaksandr Valialkin
87f916a2fb vendor: make vendor-update 2020-09-10 09:50:00 +03:00
Aliaksandr Valialkin
5afe69407d deployment/docker: update Go builder from v1.15.1 to v1.15.2
This fixes the following issues in Go runtime - see https://github.com/golang/go/issues?q=milestone%3AGo1.15.2+label%3ACherryPickApproved
2020-09-10 09:38:10 +03:00
Aliaksandr Valialkin
f5cb213ef9 lib/storage: reuse timestamp blocks for adjacent metric blocks with identical timestamps
This should reduce disk space usage when scraping targets containing metrics with identical names
such as `node_cpu_seconds_total`, histograms, quantiles, etc.

Expose `vm_timestamps_blocks_merged_total` and `vm_timestamps_bytes_saved_total` metrics for monitoring
the effectiveness of timestamp blocks merging.
2020-09-09 23:59:21 +03:00
Aliaksandr Valialkin
475698d2ad docs: sync docs for vmalert, vmauth, vmbackup and vmrestore 2020-09-09 21:10:48 +03:00
Aliaksandr Valialkin
d72fd86488 docs/Articles.md: add links to recently published third-party articles and talks about VictoriaMetrics 2020-09-09 20:15:44 +03:00
Aliaksandr Valialkin
5a82f645e4 docs/Single-server-VictoriaMetrics.md: typo fix 2020-09-09 01:22:18 +03:00
Aliaksandr Valialkin
95ad657a1c docs/Single-server-VictoriaMetrics.md: typo fix 2020-09-09 01:00:05 +03:00
Aliaksandr Valialkin
5ab57f916b docs/vmagent.md: clarified the case when -remoteWrite.queues must be tuned
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/745
2020-09-08 20:15:49 +03:00
Aliaksandr Valialkin
e5c8377212 lib/httpserver: add a jitter to connection timeouts in order to protect from Thundering herd problem 2020-09-08 19:57:20 +03:00
Aliaksandr Valialkin
df6519c190 vendor: make vendor-update 2020-09-08 15:18:59 +03:00
Aliaksandr Valialkin
f3a79abfb4 app/vmselect/promql: go fmt 2020-09-08 15:18:57 +03:00
Aliaksandr Valialkin
4f06eed1c1 app/vmselect/promql: adjust integrate() calculations to be more similar to calculations from InfluxDB: attempt #2
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/701
2020-09-08 14:36:23 +03:00
Aliaksandr Valialkin
0d0b606455 app/vmselect/promql: adjust integrate() calculations to be more similar to calculations from InfluxDB
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/701
2020-09-08 14:24:02 +03:00
Aliaksandr Valialkin
db91045348 app/vmselect/promql: increase floating point calculations accuracy by dividing by 1e3 instead of multiplying by 1e-3 2020-09-08 14:01:02 +03:00
Aliaksandr Valialkin
13dd915302 docs/Single-server-VictoriaMetrics.md: make docs-sync 2020-09-07 21:58:48 +03:00
Nikolay Khramchikhin
fb356c434b Changed s3 configProfile flag default, (#749)
The AWS SDK has complicated logic for choosing the profile name, so we shouldn't set
it to the `default` value. Doing so leads to bugs and improper configuration.
Setting it to an empty value by default is safe. It will be automatically set to `default` by the SDK.
2020-09-07 21:55:45 +03:00
Aliaksandr Valialkin
5cb8c82fe5 docs/Single-server-VictoriaMetrics.md: sync with README.md 2020-09-04 03:30:43 +03:00
Aliaksandr Valialkin
38462bd95e docs/Single-server-VictoriaMetrics.md: updates according to review comments at fe98ba5a60 2020-09-04 03:08:20 +03:00
Aliaksandr Valialkin
804304c365 app/vmselect: add missing deletion for temporary files on partial responses when -search.denyPartialResponse=true 2020-09-04 02:23:12 +03:00
Aliaksandr Valialkin
d2f6f96e4a lib/memory: fall back to reading hierarchical memory limit in cgroups when the default limit isn't set
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/699
2020-09-04 00:04:57 +03:00
Aliaksandr Valialkin
281d715060 lib/httpserver: add -http.connTimeout command-line flag for limiting the lifetime for incoming http connections
This can be useful for balancing incoming connections among multiple services.
2020-09-03 22:23:55 +03:00
Aliaksandr Valialkin
fcde009e11 vendor: update github.com/VictoriaMetrics/metricsql from v0.4.3 to v0.5.1
The new version of the package supports binary operations on string literals:

    * "foo" + "bar"     => "foobar"
    * "foo" == "bar"    => NaN
    * "foo" == "foo"    => 1
    * "foo" >bool "bar" => 1
    * "foo" < "bar"     => NaN

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/717
2020-09-03 16:33:23 +03:00
Aliaksandr Valialkin
478d8f8393 app/vmselect/promql: add count_le_over_time(m[d], le) and count_gt_over_time(m[d], gt) functions
These functions return the number of raw samples that don't exceed `le` or are bigger than `gt`.
These functions complement the already existing `share_le_over_time(m[d], le)` and `share_gt_over_time(m[d], gt)`.
2020-09-03 15:28:58 +03:00
Aliaksandr Valialkin
c3a2d4ee6f vendor: update github.com/VictoriaMetrics/metricsql from v0.4.1 to v0.4.2
The new version of this package properly supports escaped identifiers.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/743
2020-09-03 14:51:06 +03:00
Aliaksandr Valialkin
3490160fd0 app/vmselect: unconditionally align time range boundaries to step for subqueries as Prometheus does 2020-09-03 13:22:06 +03:00
Aliaksandr Valialkin
7ab7ae79c7 docs/Single-server-VictoriaMetrics.md: mention that /api/v1/series returns series for the last 5 minutes if start query arg is missing
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/735
2020-09-03 12:39:31 +03:00
Aliaksandr Valialkin
a3cdef6b06 app/vmagent: properly flush big blocks of data
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/741

Thanks to @IceRain00 for the investigation and initial attempt to fix the issue
at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/742
2020-09-03 12:12:12 +03:00
Aliaksandr Valialkin
de216bab41 app/vmagent: fix data race when accessing writeRequest.lastFlushTime 2020-09-03 12:12:09 +03:00
Nikolay Khramchikhin
80a9dc79fe changed vmalert behaviour (#738)
* VMAlert start with empty rules dir

There are some applications (operator for instance), that generates alerts configuration at runtime
and vmalert must start correctly without rules to support this behaviour.
Later application will add rules files and send SIGHUP to vmalert,
which will trigger reading rules files and start rules execution.

Removing rules files with SIGHUP signal must stop rules execution and
vmalert will wait for new rules.

* imports sorted

* added test cases for empty rules, removed blank line

* fixed imports conflict

* updated tests
2020-09-03 11:07:40 +03:00
Aliaksandr Valialkin
4388c6cad1 docs/Single-server-VictoriaMetrics.md: add missing link to Prometheus text exposition format 2020-09-03 01:14:33 +03:00
Aliaksandr Valialkin
7ac10ee978 app/vmalert: improvements over 3f932c2db1 2020-09-03 01:14:30 +03:00
DexterZhang
85f49ad439 feat: spread load of rule evaluation by group when starting new groups (#724)
* feat: spread load of rule evaluation by group when starting new groups

* review: reduce the resulting diff.

* Update app/vmalert/group.go

Co-authored-by: Roman Khavronenko <hagen1778@gmail.com>

Co-authored-by: Aliaksandr Valialkin <valyala@gmail.com>
Co-authored-by: Roman Khavronenko <hagen1778@gmail.com>
2020-09-03 01:14:26 +03:00
Aliaksandr Valialkin
4fa97430d7 app/{vminsert,vmagent}: allow adding extra labels when importing data via Prometheus, CSV and JSON line formats
Extra labels may be added to the imported data by passing `extra_label=name=value` query args.
Multiple query args may be passed in order to add multiple extra labels.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/719
2020-09-02 19:47:02 +03:00
Aliaksandr Valialkin
95ce89e7d7 lib/promscrape: use the number of parsed rows as a basis for writeRequestCtxPool leveling
The previous basis on `cap(sw.labels)` doesn't work anymore after 7785869ccc ,
because `sw.labels` may be reset multiple times when processing big number of rows.
2020-09-02 18:46:55 +03:00
Roman Khavronenko
432c0383db Single dashboards update (#736)
* dashboard: rename var `datasource` to `ds` for consistency reason

Dashboards for cluster version or vmagent operate with datasource variable
named `ds`. For consistency sake we rename this variable in single node version
as well.

* dashboard: add instance variable picker

See dashboard reviews here https://grafana.com/grafana/dashboards/10229/reviews

* dashboard: limit number of buckets in histogram to 12 for vmagent dashboard

* dashboard: bump version requirement in description for single version

* dashboard: drop extra series override for single version

* dashboard: set Y-min to zero for most of panels in vmagent dashboard
2020-09-02 15:18:29 +03:00
Roman Khavronenko
801a26340f dashboard: set Y-min to zero for most of panels in cluster dashboard (#737)
Co-authored-by: Aliaksandr Valialkin <valyala@gmail.com>
2020-09-02 15:15:13 +03:00
Aliaksandr Valialkin
63052f80fb deployment/docker: update Go builder from v1.15.0 to v1.15.1 2020-09-02 15:10:07 +03:00
John Belmonte
5ea6f86dd8 use Y-min 0 on Grafana dashboard graphs (#732) 2020-09-02 15:06:17 +03:00
Aliaksandr Valialkin
74ba0a6271 vendor: make vendor-update 2020-09-01 17:43:27 +03:00
Aliaksandr Valialkin
bc1ca4b20b lib/httpserver: add -http.idleConnTimeout command-line flag for tuning the timeout for incoming idle http connections 2020-09-01 15:33:31 +03:00
Aliaksandr Valialkin
a01c56104a lib/promscrape: fix applying sample_limit when scraping targets with big number of metrics
This has been broken at 7785869ccc
2020-09-01 11:09:25 +03:00
Aliaksandr Valialkin
deff8d419a lib/promscrape: reduce memory usage when scraping targets with millions of metrics
This should help when scraping /federate endpoints from Prometheus instances,
which scrape millions of metrics. See https://prometheus.io/docs/prometheus/latest/federation/
2020-09-01 10:55:24 +03:00
Aliaksandr Valialkin
fe08b1eb26 app/vminsert: improve error message when the data cannot be sent to vmstorage - log reroutedBR buffer size
This should improve debuggability for improperly configured cluster
2020-08-31 17:51:44 +03:00
Aliaksandr Valialkin
6f9c1bc078 app/vmagent: log unsuccessful attempt number when sending data to -remoteWrite.url 2020-08-30 21:40:15 +03:00
Aliaksandr Valialkin
3b1ecac04b app/vmagent: apply sane limits to -remoteWrite.queues
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/707
2020-08-30 21:25:51 +03:00
Aliaksandr Valialkin
49140edd41 docs/Single-server-VictoriaMetrics.md: mention that VictoriaMetrics accepts relative times at time, start and end query args 2020-08-28 10:14:33 +03:00
Aliaksandr Valialkin
7e74cf4d71 docs/vmalert.md: sync with app/vmalert/README.md via make docs-update 2020-08-28 09:52:10 +03:00
Aliaksandr Valialkin
815dffabed docs/Articles.md: add a link to https://medium.com/@romanhavronenko/victoriametrics-how-to-migrate-data-from-prometheus-filtering-and-modifying-time-series-6d40cea4bf21 2020-08-28 09:52:09 +03:00
Aliaksandr Valialkin
5f2277624a lib/{promscrape,leveledbytebufferpool}: rename getPoolIdAndCapacity to getPoolIDAndCapacity in order to make golint happy 2020-08-28 09:49:22 +03:00
Aliaksandr Valialkin
45e770ed20 lib/cgroup: limit the maximum GOMAXPROCS value to the number of available CPU cores
There is no sense in setting GOMAXPROCS to value higher than the number of available CPU cores.
2020-08-28 09:49:22 +03:00
Roman Khavronenko
08b76cb26f vmalert: update -rule flag description to enforce quotes using (#709)
Description for `-rule` flag uses as example specific chars like asterisks
which could be interpreted wrong by different shells. To avoid this, description
now contains quoted flag values.

See also #708
2020-08-28 09:46:35 +03:00
Roman Khavronenko
34ef10fbcc lib/flagutil: avoid int overflow for arch 386 (#710)
Arch 386 is a 32-bit architecture and interprets int type for numbers as an explicit int32,
whereas on most modern CPUs int is implicitly an int64. This makes tests to fail with
`int overflow` error.
2020-08-28 09:46:35 +03:00
Aliaksandr Valialkin
9a77ae9d1c lib/promscrape: reduce memory usage when scraping targets with big number of metrics alongside targets with small number of labels
Previously targets with big number of metrics and/or labels could generate too big buffers,
which then could be re-used when scraping targets with small number of metrics.
This resulted in memory waste.

Now big buffers are used only for targets with big number of metrics / labels,
while small buffers are used for targets with small number of metrics / labels.
2020-08-16 22:30:34 +03:00
Aliaksandr Valialkin
3ea6444219 lib/leveledbytebufferpool: allocate byte buffers with capacity rounded to the upper boundary for the given bucket
This should reduce the number of resizings for the returned byte buffers.
2020-08-16 22:13:38 +03:00
Roman Khavronenko
4b89da9463 lib/decimal: rename significant decimal digits to significant figures (#698)
The previous notion was inconsistent with what `decimal.Round` does.
According to [wiki](https://en.wikipedia.org/wiki/Significant_figures) rounding
applied to all significant figures, not just decimal ones.
2020-08-16 17:22:40 +03:00
Aliaksandr Valialkin
6aab2f4989 all: allow using KB, MB, GB, KiB, MiB and GiB suffixes in command-line flag values related to byte sizes or byte rates 2020-08-16 17:08:28 +03:00
Aliaksandr Valialkin
1b5467f7fd lib/memory: improve log message about the memory allowed to use by VictoriaMetrics 2020-08-16 17:08:27 +03:00
Aliaksandr Valialkin
d9f7ea1c6e lib/protoparser: removed unnecessary call to SetReadDeadline when reading a stream of data
The OS should return any buffered data in the stream without the need to set the read timeout.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/696
2020-08-15 15:38:35 +03:00
Aliaksandr Valialkin
98217e4c40 vendor: upgrade github.com/valyala/gozstd from v1.7.1 to v1.8.3 2020-08-15 15:11:50 +03:00
Aliaksandr Valialkin
008a769434 vendor: downgrade github.com/valyala/gozstd from v1.8.1 to v1.7.1 until https://github.com/facebook/zstd/issues/2222 is fixed 2020-08-15 14:51:16 +03:00
Aliaksandr Valialkin
edf3ee2a9b lib: dump compressed block contents on error during decompression
This should improve detecting root cause for https://github.com/facebook/zstd/issues/2222
2020-08-15 14:51:14 +03:00
Aliaksandr Valialkin
7fb942308c vendor: update github.com/valyala/gozstd from v1.7.0 to v1.8.1 2020-08-15 13:46:34 +03:00
Aliaksandr Valialkin
6e863376f7 lib/leveledbytebufferpool: pre-allocate byte slice with the given capacity if the pool is empty
This should reduce memory allocations and copying when the byte slice is growing.
2020-08-15 01:41:59 +03:00
Aliaksandr Valialkin
285665e93b app/vmselect/promql: allow passing multiple args to aggregate functions such as avg(q1, q2, q3) 2020-08-15 01:15:16 +03:00
Aliaksandr Valialkin
a2021d0dde docs/vmagent.md: mention that gaps in remote storage may appear if vmagent cannot keep up with data ingestion 2020-08-14 20:48:17 +03:00
Aliaksandr Valialkin
3efa4e4e1c lib/protoparser: move common code for detecting timeouts to ReadLinesBlockExt
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/696
2020-08-14 20:39:51 +03:00
Aliaksandr Valialkin
c6b0547847 lib/protoparser: prevent from busy loop on repeated timeout errors when reading streams of ingested data
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/696
2020-08-14 20:13:37 +03:00
Aliaksandr Valialkin
9d79a3a99d lib/memory: add -memory.allowedBytes command-line flag for setting absolute memory limit for VictoriaMetrics caches 2020-08-14 19:19:10 +03:00
Aliaksandr Valialkin
e7c0b2ca56 docs: update docs 2020-08-14 19:14:46 +03:00
Aliaksandr Valialkin
b996280c65 app/{vminsert,vmagent}: improve documentation for -influxListenAddr command-line flag 2020-08-14 18:03:08 +03:00
Aliaksandr Valialkin
c82a485cf6 lib/protoparser/prometheus: typo fix in error message 2020-08-14 11:04:15 +03:00
Aliaksandr Valialkin
f4c90449dc deployment/dm: update images 2020-08-14 02:27:55 +03:00
Aliaksandr Valialkin
0a34f56b39 vendor: update github.com/VictoriaMetrics/fasthttp from v1.0.4 to v1.0.5 2020-08-14 02:19:56 +03:00
Aliaksandr Valialkin
d615ae81e5 vendor: update github.com/klauspost/compress from v1.10.10 to v1.10.11 2020-08-14 02:16:39 +03:00
Aliaksandr Valialkin
9e67343756 lib/promscrape: use a hint on body length instead of body capacity
This should reduce memory usage for https://github.com/VictoriaMetrics/VictoriaMetrics/issues/689
2020-08-14 01:17:46 +03:00
Aliaksandr Valialkin
b4119bb51e lib/promscrape: reduce memory usage when scraping big number of targets
Thanks to @dxtrzhang for the original idea at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/688

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/689
2020-08-14 01:05:04 +03:00
Aliaksandr Valialkin
1724cc241e lib/promscrape: properly retry requests on the server closed connection before returning the first response byte error during service discover API calls and target scrapes 2020-08-13 22:32:29 +03:00
Aliaksandr Valialkin
60c7397be5 all: support %{ENV_VAR} placeholders in yaml configs in all the vm* components
Such placeholders are substituted by the corresponding environment variable values.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/583
2020-08-13 17:17:06 +03:00
Aliaksandr Valialkin
aa7f1a9d8f deployment/docker: update Go builder from Go1.14.7 to Go1.15.0 2020-08-13 15:53:39 +03:00
Aliaksandr Valialkin
c5d50a5940 docs/Cluster-VictoriaMetrics.md: mention about Kubernetes operator 2020-08-12 21:15:01 +03:00
Aliaksandr Valialkin
801a0241fc docs/Single-server-VictoriaMetrics.md: mention helm charts, k8s operator and vmctl tool in Integrations chapter 2020-08-12 21:12:45 +03:00
Aliaksandr Valialkin
f20032dbb5 docs/Articles.md: added https://medium.com/@romanhavronenko/victoriametrics-how-to-migrate-data-from-prometheus-d44a6728f043 2020-08-12 21:03:25 +03:00
Aliaksandr Valialkin
6721e47ae9 app: respect CPU limits set via cgroups
Update GOMAXPROCS to limits set via cgroups. This should reduce CPU thrashing and reduce memory usage
for cases when VictoriaMetrics components run in containers with CPU limits.

See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/685
2020-08-11 23:01:03 +03:00
Aliaksandr Valialkin
59d95961b8 lib/protoparser: clarify that the string passed to Unmarshal() function must remain available when the parsed rows are in use 2020-08-11 17:05:21 +03:00
Aliaksandr Valialkin
315b752245 docs/Single-server-VictoriaMetrics.md: mention that it is safe to skip multiple versions during the upgrade 2020-08-11 14:22:24 +03:00
Aliaksandr Valialkin
62b6e54622 app/vmselect: reduce memory usage when exporting time series with big number of samples via /api/v1/export if max_rows_per_line is set to non-zero value
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/685
2020-08-10 20:57:43 +03:00
Aliaksandr Valialkin
890cfe5b61 lib/protoparser/influx: accept precision=us and precision=µ according to https://docs.influxdata.com/influxdb/v1.8/tools/api/#write-http-endpoint 2020-08-10 20:23:20 +03:00
Aliaksandr Valialkin
e3439a6cd0 lib/promscrape: optimize per-metric hash calculations
This increases vmagent performance by up to 10% when scraping big number of metrics
2020-08-10 19:47:50 +03:00
Aliaksandr Valialkin
c9f5c5623f app/vmselect/netstorage: vary batch size for data unpacking depending on the available CPU cores
This should reduce contention on the channel with unpack work for systems with high number of CPU cores
2020-08-10 15:16:48 +03:00
Aliaksandr Valialkin
4ce1368e4b lib/storage: mention time range used in the query that led to error message
This should improve detecting slow queries with too big time ranges
2020-08-10 13:46:29 +03:00
Aliaksandr Valialkin
f92255e803 lib/storage: mention tag filters used in the query that led to error message
This should improve detecting invalid or heavy queries that lead to errors.
2020-08-10 13:36:54 +03:00
Aliaksandr Valialkin
b3d4ff7ee2 app/vmstorage: improve error logging when the request times out 2020-08-10 13:17:24 +03:00
Aliaksandr Valialkin
e3999ac010 lib/promscrape: show real timestamp and real duration for the scrape on /targets page
Previously the scrape duration may be negative when calculated scrape timestamp drifts away from the real scrape timestamp
2020-08-10 12:40:49 +03:00
Aliaksandr Valialkin
43830b1699 vendor: make vendor-update 2020-08-09 15:14:09 +03:00
Aliaksandr Valialkin
c09c881264 lib/promscrape: make errcheck happy 2020-08-09 13:17:30 +03:00
Aliaksandr Valialkin
2dfb42a8b4 lib/promscrape: export scrape_samples_added per-target metric like Prometheus does
This metric may be useful for detecting targets with high churn rate for the exported metrics.

See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/683
2020-08-09 12:45:30 +03:00
Aliaksandr Valialkin
fd9f1463df lib/fs: use WARN instead of ERROR log level for the message when NFS directory removal temporarily fails
this is expected condition, so it is better to use WARN log level for it
2020-08-09 12:07:35 +03:00
Aliaksandr Valialkin
d4be3efc60 lib/promscrape: add a test for scrape config for blackbox exporter
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/684
2020-08-09 12:03:51 +03:00
Roman Khavronenko
78afc61896 app/vmalert: extend metrics set exported by vmalert #573 (#654)
* app/vmalert: extend metrics set exported by `vmalert` #573

New metrics were added to improve observability:
+ vmalert_alerts_pending{alertname, group} - number of pending alerts per group
per alert;
+ vmalert_alerts_acitve{alertname, group} - number of active alerts per group
per alert;
+ vmalert_alerts_error{alertname, group} - is 1 if alertname ended up with error
during prev execution, is 0 if no errors happened;
+ vmalert_recording_rules_error{recording, group} - is 1 if recording rule
 ended up with error during prev execution, is 0 if no errors happened;
* vmalert_iteration_total{group, file} - now contains group and file name labels.
This should improve control over specific groups;
* vmalert_iteration_duration_seconds{group, file} - now contains group and file name labels. This should improve control over specific groups;

Some collisions for alerts and recording rules are possible, because neither
group name nor alert/recording rule name are unique for compatibility reasons.

Commit contains list of TODOs for Unregistering metrics since groups and rules
are ephemeral and could be removed without application restart. In order to
unlock Unregistering feature corresponding PR was filed - https://github.com/VictoriaMetrics/metrics/pull/13

* app/vmalert: extend metrics set exported by `vmalert` #573

The changes are following:
* add an ID label to rules metrics, since `name` collisions within one group is
a common case - see the k8s example alerts;
* supports metrics unregistering on rule updates. Consider the case when one rule
was added or removed from the group, or the whole group was added or removed.

The change depends on https://github.com/VictoriaMetrics/metrics/pull/16
where race condition for Unregister method was fixed.
2020-08-09 09:42:05 +03:00
ofen
3fea7c39be 401 Unauthorize HTTP error added (#681)
401 Unauthorize HTTP error added to trigger browser credentials pop-up prompt [RFC 7235 https://developer.mozilla.org/en-US/docs/Web/HTTP/Authentication]
2020-08-09 09:39:37 +03:00
Aliaksandr Valialkin
d8aa433c4d vendor: update github.com/VictoriaMetrics/metrics from v1.12.2 to v1.12.3 2020-08-07 13:02:35 +03:00
Aliaksandr Valialkin
67cacb22ac lib/httpserver: add -tls, -tlsCertFile and -tlsKeyFile command-line flags in every vm binary
This makes such binaries compatible with binaries from `master` branch (aka single-node version)

See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/677
2020-08-07 10:57:32 +03:00
Aliaksandr Valialkin
307281e922 lib/storage: slow down concurrent searches when the number of concurrent inserts reaches the limit
This should improve data ingestion performance when heavy searches are executed

See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/618
2020-08-07 08:49:13 +03:00
Aliaksandr Valialkin
dd1d59f57a lib/storage: properly check timeouts and pace limits
Previously they were checked on every iteration for small number of iterations
2020-08-07 08:40:56 +03:00
Aliaksandr Valialkin
2704722b6d deployment/docker: update Go builder from v1.14.6 to v1.14.7 2020-08-07 08:29:25 +03:00
Aliaksandr Valialkin
87d7710c8e docs/MetricsQL.md: mention that MetricsQL removes all the NaN values from results 2020-08-07 07:51:40 +03:00
Aliaksandr Valialkin
95a8c492ef app/vmselect/promql: properly handle -n^m like Prometheus does
`-n^m` must be handled as `-(n^m)` instead of `(-n)^m`.

See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/675
2020-08-07 07:42:42 +03:00
Aliaksandr Valialkin
7f93d61a56 app/vmselect/promql: remove metric name after applying clamp_min and clamp_max functions in order to be consistent with Prometheus
This improves VictoriaMetrics score at https://promlabs.com/promql-compliance-test-results-victoriametrics/
2020-08-06 23:42:55 +03:00
Aliaksandr Valialkin
01000505a0 app/vmselect/promql: remove metric name after applying ceil, floor and round functions in order to be more consistent with Prometheus
This improves VictoriaMetrics score at https://promlabs.com/promql-compliance-test-results-victoriametrics/
2020-08-06 23:34:03 +03:00
Aliaksandr Valialkin
75bff1a567 app/vmselect/promql: remove metric name from results of certain rollup functions in order to be consistent with Prometheus
Rollup functions:

  - avg_over_time
  - min_over_time
  - max_over_time
  - quantile_over_time

This improves VictoriaMetrics results at https://promlabs.com/promql-compliance-test-results-victoriametrics/
2020-08-06 23:29:18 +03:00
Aliaksandr Valialkin
8835004a4c app/vmselect: properly handle PromQL queries like scalar1 < metric < scalar2 like Prometheus does
This fixes some cases from https://promlabs.com/promql-compliance-test-results-victoriametrics/
2020-08-06 23:21:14 +03:00
Aliaksandr Valialkin
3947307363 vendor: update github.com/VictoriaMetrics/metricsql from v0.2.10 to v0.3.0
This adds support for special integers in MetricsQL that start from 0x, 0b, 0o.
This improves compatibility with PromQL - see https://promlabs.com/promql-compliance-test-results-victoriametrics/
2020-08-06 21:45:33 +03:00
Aliaksandr Valialkin
a2039b3bbc app/vmselect: return the upper bound on the number of found time series from storage.Search.Init
This is used by a single-node version in order to reduce memory allocations during search.
See bc8381613d for details.
2020-08-06 19:20:31 +03:00
Aliaksandr Valialkin
b690eeff53 lib/storage: reduce the frequency (and overhead) for timeout and pace limiter checks by 4x 2020-08-06 18:45:47 +03:00
Aliaksandr Valialkin
6c0a92a1ee lib/pacelimiter: increase scalability for multi-CPU system 2020-08-06 18:33:07 +03:00
Aliaksandr Valialkin
14ddb8a34e app/vmselect/netstorage: reduce CPU contention when unpacking time series blocks by unpacking batches of such blocks instead of a single block
This should improve query performance on systems with big number of CPU cores (16 and more)
2020-08-06 17:50:13 +03:00
Aliaksandr Valialkin
46c98cd97a app/vmselect/netstorage: reduce contention on unpackworkCh and timeseriesWorkCh for multi-CPU system by providing more capacity for these chans 2020-08-06 17:22:39 +03:00
Aliaksandr Valialkin
13f8644f8e lib/storage: optimize prefetching metric names for the given metricIDs 2020-08-06 16:52:58 +03:00
Aliaksandr Valialkin
a455930ab4 app/vmstorage: rename vm_cache_size_entries{type="storage/prefetchedMetricIDs"} to vm_cache_entries{type="storage/prefetchedMetricIDs"} to be consistent with other vm_cache_entries metrics 2020-08-06 16:34:18 +03:00
Aliaksandr Valialkin
f789e0fa44 lib/fs: export vm_nfs_pending_dirs_to_remove metric for monitoring the number of pending directories that couldn't be removed due to NFS lock 2020-08-06 15:31:50 +03:00
Aliaksandr Valialkin
a3e91c593b lib/storage: limit the number of concurrent calls to storage.searchTSIDs to GOMAXPROCS*2
This should limit the maximum memory usage and reduce CPU thrashing on vmstorage
when multiple heavy queries are executed.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
2020-08-05 18:27:21 +03:00
Aliaksandr Valialkin
76064ba9e7 Perform conversion from string to []byte according to rule #6 at https://golang.org/pkg/unsafe/#Pointer 2020-08-05 11:55:12 +03:00
Aliaksandr Valialkin
9038df211f vendor: make vendor-update 2020-08-05 11:15:47 +03:00
Aliaksandr Valialkin
a930460236 app/vmagent: tune http client for sending data to remote storage in order to disable closing keep-alive connections
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/663
2020-08-04 21:01:40 +03:00
Aliaksandr Valialkin
a04f4a3d9a app/vmselect: use warning level instead of info level for logging slow queries that take longer than -search.logSlowQueryDuration 2020-08-04 20:24:38 +03:00
Aliaksandr Valialkin
265206a6c3 docs/Single-server-VictoriaMetrics.md: add a chapter about data updates 2020-08-04 13:54:29 +03:00
Aliaksandr Valialkin
8cc2e01386 lib/backup: allow using ~/.aws/config without region
Use us-west-2 for determining bucket region.
2020-08-04 13:08:05 +03:00
Aliaksandr Valialkin
bdb881c43b app/vmselect/promql: add zscore-related functions: zscore_over_time(m[d]) and zscore(q) by (...) 2020-08-03 21:52:15 +03:00
Aliaksandr Valialkin
94471a1273 app: remove duplicate *-pure makefile rules 2020-07-31 20:01:30 +03:00
Aliaksandr Valialkin
a2aa3a60eb app/vmselect: show X-Forwarded-For contents on /api/v1/status/active_queries page
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/659
2020-07-31 20:01:09 +03:00
Aliaksandr Valialkin
3149af624d lib/storage: reduce the maximum number of concurrent merge workers to GOMAXPROCS/2
Previously the limit was raised to GOMAXPROCS, but it turned out that this
increases query latencies since more CPUs are busy with merges.

While at it, substitute `*MergeConcurrencyLimitCh` channels with simple integer limits.
2020-07-31 17:53:13 +03:00
Aliaksandr Valialkin
106e302d7a all: add missing APP_NAME to vm*-GOARCH builds 2020-07-31 13:45:32 +03:00
Aliaksandr Valialkin
945645f38f docs/{vmagent,vmalert}: add instruction on how to build for ARM 2020-07-31 09:25:41 +03:00
Aliaksandr Valialkin
a0eec52e6c docs/Single-server-VictoriaMetrics.md: mention that downgrade is also safe to perform 2020-07-31 09:21:03 +03:00
Aliaksandr Valialkin
bbc7583015 vendor: update github.com/valyala/quicktemplate from v1.5.2 to v1.6.0 2020-07-30 23:39:48 +03:00
Aliaksandr Valialkin
0c00fe70cf app/vmselect: do not adjust start and end query args passed to /api/v1/query_range when -search.disableCache command-line flag is set
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/563
2020-07-30 23:14:56 +03:00
Aliaksandr Valialkin
e9860b2fa3 docs/vmalert.md: sync with app/vmalert/README.md 2020-07-30 21:57:20 +03:00
Aliaksandr Valialkin
29bbab0ec9 lib/storage: remove prioritizing of merging small parts over merging big parts, since it doesn't work as expected
The prioritizing could lead to big merge starvation, which could end up in too big number of parts that must be merged into big parts.

Multiple big merges may be initiated after the migration from v1.39.0 or v1.39.1. It is OK - these merges should be finished soon,
which should return CPU and disk IO usage to normal levels.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/618
2020-07-30 20:02:22 +03:00
Aliaksandr Valialkin
96039dcb40 lib/storage: properly update vm_slow_row_inserts_total metric when importing multiple data points per time series at once
Previously the `vm_slow_row_inserts_total` metric may be incremented multiple times for different data points per a single time series,
while only a single increment is needed when inserting the first data point for this time series.
2020-07-30 16:17:19 +03:00
Aliaksandr Valialkin
bafd475f2c vendor: update github.com/valyala/quicktemplate from v1.5.1 to v1.5.2 2020-07-29 18:20:19 +03:00
Aliaksandr Valialkin
1e067401ba lib/httpserver: emit X-Forwarded-For additionally to remoteAddr in error logs
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/659
2020-07-29 13:12:35 +03:00
Aliaksandr Valialkin
338ee47d60 app/vmselect/promql: return non-empty value from rate_over_sum(m[d]) even if a single data point is located in the given [d] window
Just divide the data point value by the window duration in this case.
2020-07-29 12:37:34 +03:00
Aliaksandr Valialkin
717c554fb0 app/vmselect/promql: remove rollupFuncArg.realPrevValue handling, since the corner case in increase() is handled in another way now
See e00cfc854d for the approach used now.
2020-07-29 12:37:34 +03:00
Aliaksandr Valialkin
d9037b3970 app/vmselect/promql: fill gaps with 0 in rate_over_sum response when the last value before the selected time window isn't empty 2020-07-29 12:37:34 +03:00
Aliaksandr Valialkin
f068ea74d0 vendor: make vendor-update 2020-07-29 09:35:34 +03:00
Aliaksandr Valialkin
ab47fa2300 vendor: update github.com/VictoriaMetrics/metrics from v1.12.1 to v1.12.2 2020-07-28 22:01:41 +03:00
Aliaksandr Valialkin
f6d4275087 app/{vmagent,vminsert}: properly preserve db tag from query string passed to Influx line protocol query
Previously `db` tag from the query string wasn't added to metrics after encountering `db` tag in the Influx line

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/653
2020-07-28 21:25:49 +03:00
Aliaksandr Valialkin
baebe86844 app/vmagent/remotewrite: add missing resp.Body.Close() after pushing data to remote storage
Missing body close could disable HTTP keep-alive connections.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/653
2020-07-28 21:00:25 +03:00
Aliaksandr Valialkin
0f6f0d30d3 app/vmselect: show query origin (aka remote_addr or client address) on the /api/v1/status/active_queries page for every query 2020-07-28 15:14:40 +03:00
Roman Khavronenko
ec6ed467c6 app/vmalert: support external.label to specify global labelset for all rules #622 (#652)
`external.label` flag is supposed to help to distinguish alert or recording rules
source in situations when more than one `vmalert` runs for the same datasource
or AlertManager.
2020-07-28 14:23:04 +03:00
Aliaksandr Valialkin
9dccedc599 app/vmselect/promql: return empty values from group() if all the time series have no values at the given timestamp
This aligns `group()` behaviour to Prometheus
2020-07-28 13:41:04 +03:00
Aliaksandr Valialkin
312acf7ce9 docs/MetricsQL.md: small fixes in the docs 2020-07-28 13:28:01 +03:00
Aliaksandr Valialkin
91d4673bd6 docs/Single-server-VictoriaMetrics.md: mention that OpenTSDB data ingestion protocol is used by KairosDB 2020-07-28 13:11:36 +03:00
Aliaksandr Valialkin
ce7c8898af vendor: update github.com/VictoriaMetrics/metrics from v1.12.0 to v1.12.1 2020-07-28 00:21:06 +03:00
Sasasu
96bc476e53 lib/storage: metaindexRow use memory more efficiently (#655)
due to memory alignment the metaindexRow structure uses 64 bytes per object.
this commit changes the order of fields, making metaindexRow use 56 bytes per
object.

Signed-off-by: Sasasu <su@sasasu.me>
2020-07-27 23:23:25 +03:00
Aliaksandr Valialkin
f26ef58137 lib/protoparser/prometheus: add a test for cassandra-exporter
Thanks to Seva
2020-07-27 18:37:46 +03:00
Aliaksandr Valialkin
d5057f6d04 app/vmagent/remotewrite: create new request on failure to send a block of data to remote storage
Previously the request body was already consumed before the retry, so this led to the following error:

    http: ContentLength=... with Body length 0
2020-07-27 17:33:05 +03:00
Aliaksandr Valialkin
b191e425b3 app/vmselect/promql: improve further the accuracy of buckets_limit() function
The accuracy is increased by merging the smallest bucket with the smallest adjacent bucket.
2020-07-26 12:10:56 +03:00
Aliaksandr Valialkin
43871e79c6 app/vmselect/promql: avoid dropping inf bucket in buckets_limit
The `le="inf"` bucket must be preserved in order to maintain the maximum level of accuracy.
2020-07-25 17:00:25 +03:00
Aliaksandr Valialkin
978c1e930e app/vmselect/promql: optimize buckets_limit(k, buckets) for big number of buckets 2020-07-25 13:24:33 +03:00
Aliaksandr Valialkin
cc735da814 deployment/docker/docker-compose.yml: update Grafana version from 7.0.3 to 7.1.1 2020-07-24 18:41:49 +03:00
Aliaksandr Valialkin
51cbf27077 app/vmselect/promql: improve the accuracy of buckets_limit(k, buckets) function
Now it properly merges the bucket with the previous bucket after deletion.
2020-07-24 17:07:30 +03:00
Aliaksandr Valialkin
cf69b1ea6f app/vmselect/promql: add buckets_limit(k, buckets) function, which limits the number of buckets per time series to k
This function works with both Prometheus-style and VictoriaMetrics-style buckets.
The function removes buckets with the lowest values in order to preserve the highest precision.
The function is useful for building heatmaps in Grafana from too big number of buckets.
2020-07-24 16:14:12 +03:00
Aliaksandr Valialkin
45334f61de app/vmselect: fix tests for rate_over_sum 2020-07-24 02:35:09 +03:00
Aliaksandr Valialkin
3526e8768a app/vmselect/promql: typo fix after 3e557c9861 2020-07-24 02:15:23 +03:00
Aliaksandr Valialkin
94cc677b0c lib/storage: slightly reduce code difference between single-node and cluster versions 2020-07-24 01:18:05 +03:00
Aliaksandr Valialkin
8d1721d128 app/vmselect/promql: add rate_over_sum(m[d]) function to MetricsQL, which returns rate over sum of m values over d duration
Something like `sum_over_time(m[d]) / d`, but more accurate.
2020-07-24 01:17:15 +03:00
Aliaksandr Valialkin
88e8bed0c9 app/vmselect/promql: allow setting [d] window smaller than the interval between raw points for avg_over_time
This makes `avg_over_time` behavior consistent with `sum_over_time` and `count_over_time` behaviors.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/636
2020-07-23 22:25:33 +03:00
Aliaksandr Valialkin
fb3d1380ac lib/storage: respect -search.maxQueryDuration when searching for time series in inverted index
Previously the time spent on inverted index search could exceed the configured `-search.maxQueryDuration`.
This commit stops searching in inverted index on query timeout.
2020-07-23 21:22:05 +03:00
Aliaksandr Valialkin
dbf3038637 lib/storage: add more fine-grained pace limiting for search 2020-07-23 19:21:49 +03:00
Aliaksandr Valialkin
16a4b1b20c app/vmselect/netstorage: protect from too smart compiler, which may break memory usage optimization in tmpBlocksFileWrapper.WriteBlocks 2020-07-23 17:57:24 +03:00
Aliaksandr Valialkin
0750d2cec1 app/vminsert: export vm_relabel_metrics_dropped_total metric that shows the number of metrics dropped due to relabeling 2020-07-23 14:58:02 +03:00
Aliaksandr Valialkin
55ed07add1 app/vmselect: typo fix after 0168e21fe32776e2f7f003f88e0e6e490eb2dcb0g 2020-07-23 14:11:15 +03:00
Aliaksandr Valialkin
7aa5b48508 app/vmselect: reduce memory usage when querying big number of time series with long labels
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/646
2020-07-23 13:48:58 +03:00
Aliaksandr Valialkin
49a0011837 app/vminsert: do not call ApplyRelabeling function if relabeling is disabled
This should reduce CPU usage a bit when `-relabelConfig` isn't set
2020-07-23 13:35:36 +03:00
Aliaksandr Valialkin
c91ccce50c app/vminsert: fix relabeling for metrics ingested via Influx line protocol
Previously the enabled relabeling with `-relabelConfig` command-line flag could result in missing labels
if a single Influx line protocol message contains multiple field values.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/638
2020-07-23 13:25:37 +03:00
Aliaksandr Valialkin
b8303afcd8 lib/storage: improve prioritizing of data ingestion over querying
Prioritize also small merges over big merges.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
2020-07-23 01:40:38 +03:00
Aliaksandr Valialkin
7d0743422b lib/storage: properly calculate global metrics in UpdateStats() 2020-07-23 00:35:31 +03:00
Aliaksandr Valialkin
6afdcf8a20 lib/mergeset: properly calculate global metrics in UpdateStats()
Previously these metrics could be calculated multiple times for multiple mergeset.Table instances.
2020-07-23 00:35:29 +03:00
Aliaksandr Valialkin
23fa44e56e lib/storage: reorder mergeBlockStreams() args in order to make them more consistent 2020-07-22 21:58:25 +03:00
Aliaksandr Valialkin
754eac676d lib/storage: prevent possible race condition when all the goroutines exit Storage.AddRows, before other goroutines are blocked on searchTSIDsCond inside Storage.searchTSIDs
This condition may occur after the following sequence of events:

1) A goroutine enters the loop body when len(addRowsConcurrencyCh) == cap(addRowsConcurrencyCh) inside Storage.searchTSIDs.
2) All the goroutines return from Storage.AddRows.
3) The goroutine from step 1 blocks on searchTSIDsCond.Wait() inside the loop body.

The goroutine remains blocked until the next call to Storage.AddRows, which calls searchTSIDsCond.Signal().
This may take indefinite time.
2020-07-22 21:52:42 +03:00
Aliaksandr Valialkin
71c3266fca docs/Single-server-VictoriaMetrics.md: mention that it is recommended inspecting logs during troubleshooting 2020-07-22 18:21:06 +03:00
Aliaksandr Valialkin
edbc777e91 vendor: make vendor-update 2020-07-22 16:54:02 +03:00
Aliaksandr Valialkin
20d0c41ac5 app/vmselect/prometheus: support d, w and y suffixes for durations passed to step in /api/v1/query_range like Prometheus does
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/641
2020-07-22 16:27:27 +03:00
Aliaksandr Valialkin
bd4299fafe app/vmselect/netstorage: reduce memory allocations when unpacking time series data by using a pool for unpackWork entries
This should slightly reduce load on GC when processing queries that touch big number of time series.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/646 according to the provided memory profile
2020-07-22 15:04:42 +03:00
Roman Khavronenko
9460bf782e dashboards/victoriametrics: update Concurrent inserts panel #632 (#645)
Panel `Concurrent inserts` was moved to `vminsert` row. Its metrics and description
were updated according to issue #632.
2020-07-22 12:43:23 +03:00
Aliaksandr Valialkin
a3f48e395e app/vmagent: add -remoteWrite.decimalPlaces command-line flag, which may be used for reducing disk space usage on the remote storage 2020-07-21 21:55:42 +03:00
Aliaksandr Valialkin
67be79a0bc lib/uint64set: optimize adding items to the set via Set.AddMulti 2020-07-21 20:57:05 +03:00
Aliaksandr Valialkin
5bb4fe1ba4 app/vmselect: take into account the time spent in wait queue before query execution as time spent on the query 2020-07-21 19:00:00 +03:00
Aliaksandr Valialkin
0755cb3b50 app/vmselect/promql: skip the first value in time series passed to increase() if it exceeds by more than 10x the delta between the next value and the first value
This should prevent inflated `increase()` results for time series that start from big initial values.
Such cases may occur when a label value changes in a metric without counter reset.
2020-07-21 17:24:28 +03:00
Aliaksandr Valialkin
71eba8dcf5 app/vmselect: log the total available memory for concurrent requests on not enough memory errors
This should simplify root cause analysis
2020-07-20 19:51:58 +03:00
Aliaksandr Valialkin
3b246aa569 app/vmagent: add -remoteWrite.proxyURL command-line option
This option allows writing data to `-remoteWrite.url` via http, https or socks5 proxy.
This is similar to `proxy_url` option in `remote_write` section of Prometheus.
See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write
2020-07-20 19:31:08 +03:00
Aliaksandr Valialkin
8bee3ef91b docs/vmagent.md: sync with app/vmagent/README.md 2020-07-20 17:09:30 +03:00
Roman Khavronenko
8949ec961d app/vmagent: mention grafana dashboard in README (#639) 2020-07-20 17:09:27 +03:00
Aliaksandr Valialkin
77523af9fc vendor: update github.com/VictoriaMetrics/metrics from v1.11.3 to v1.12.0 2020-07-20 16:56:44 +03:00
Aliaksandr Valialkin
86b54f3768 app/vmagent/remotewrite: allow passing empty -remoteWrite.urlRelabelConfig entries 2020-07-20 15:49:13 +03:00
Aliaksandr Valialkin
141e84b5a4 app/vmselect/prometheus: do not return time series with empty list of datapoints from /api/v1/query_range
This matches Prometheus behaviour.

This should fix https://github.com/jacksontj/promxy/issues/329
2020-07-20 15:30:13 +03:00
Aliaksandr Valialkin
4d2011a87d app/vmselect/promql: add mode() aggregate function 2020-07-20 15:30:11 +03:00
Aliaksandr Valialkin
31ef39e8da lib/httpserver: log remote address in error message from httpserver.Errorf
This should improve detection of the root cause of errors.
Thanks to Anant for the idea.
2020-07-20 14:06:29 +03:00
Aliaksandr Valialkin
427fa43ce2 app/vmselect/promql: add mode_over_time(m[d]) function
See https://en.wikipedia.org/wiki/Mode_(statistics) and https://stackoverflow.com/questions/61134078/promql-query-to-return-the-value-from-a-range-vector-which-occurs-maximum-no-of
2020-07-17 18:29:10 +03:00
Aliaksandr Valialkin
eb402a17bd app/vmselect/promql: optimize group(rollup(m)) calculations 2020-07-17 16:47:30 +03:00
Aliaksandr Valialkin
ea8dc85ba8 app/vmselect/promql: check that any() doesn't touch metric name 2020-07-17 16:23:11 +03:00
Aliaksandr Valialkin
b8b13e82e0 deployment/docker: update Go builder from v1.14.5 to v1.14.6
This fixes runtime issues found in Go since v1.14.5. See https://github.com/golang/go/issues?q=milestone%3AGo1.14.6+label%3ACherryPickApproved
2020-07-17 15:21:12 +03:00
Aliaksandr Valialkin
fc8fe38a82 app/vmselect/promql: add group() aggregate function to MetricsQL
This function has been added in Prometheus 2.20. See https://github.com/prometheus/prometheus/pull/7480
2020-07-17 15:17:38 +03:00
Aliaksandr Valialkin
c64914a7e4 app/vmselect/promql: keep all labels for time series from any() call 2020-07-17 15:17:37 +03:00
Roman Khavronenko
21cf6a1ec4 deployment/docker: replace Prometheus with vmagent (#635)
* replace Prometheus with vmagent in docker compose env;
* cluster dashboard: exclude vmagent from job list;
* cluster dashboard: reference datasource var instead of datasource name.
2020-07-17 02:18:03 +03:00
Roman Khavronenko
87946dcc53 vmagent: update grafana dashboard (#634)
* reference datasource variable instead of datasource name;
* change unit from `bytes` to `bits/s` for Network panel.
2020-07-17 02:12:20 +03:00
Aliaksandr Valialkin
f9b38f7f2d app/vminsert/influx: properly handle the case when certain labels with empty values are removed by ApplyRelabeling() call
Previously this could lead to `out of range` panic
2020-07-17 00:05:24 +03:00
Aliaksandr Valialkin
14dc426b45 app/vmselect: fix nil pointer dereference panic when unsuccessfully querying vmstorage 2020-07-16 19:15:18 +03:00
Aliaksandr Valialkin
490a42f592 deployment/docker: update Go builder from v1.14.4 to v1.14.5
This should fix the following issues in Go - https://github.com/golang/go/issues?q=milestone%3AGo1.14.5+label%3ACherryPickApproved
2020-07-16 18:55:37 +03:00
Roman Khavronenko
cb4c433260 vmagent: add grafana dashboard (#629)
`vmagent` Grafana dashboard is supposed to provide basic observability over multiple
`vmagent` instances. Dashboard is saved in Grafana export format so it can be easily
imported. It was also integrated into docker-compose environment.
2020-07-15 13:58:30 +03:00
Aliaksandr Valialkin
ce381b3868 app/vmalert: consistently use "%w" instead of "%s" in fmt.Errorf when wrapping errors 2020-07-15 13:55:13 +03:00
Aliaksandr Valialkin
e6d96bb0bd docs/vmagent.md: make filtering rules for init container pods less confusing 2020-07-14 20:33:19 +03:00
Aliaksandr Valialkin
74fb0b293d vendor: make vendor-update 2020-07-14 20:30:57 +03:00
Aliaksandr Valialkin
8e7c7a6fbd docs/Single-server-VictoriaMetrics.md: remove Roadmap chapter, since it became outdated 2020-07-14 19:07:06 +03:00
Aliaksandr Valialkin
c2b4b9138d app/vmagent/remotewrite: return proper value from tssRelabelPool.New
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/599
2020-07-14 14:28:14 +03:00
Aliaksandr Valialkin
3365f6867b docs/Single-server-VictoriaMetrics.md: sync with README.md 2020-07-14 14:19:53 +03:00
Aliaksandr Valialkin
86044f6561 app/{vminsert,vmagent}: add -influxSkipMeasurement command-line flag for using field name as metric name
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/626
2020-07-14 14:18:40 +03:00
Aliaksandr Valialkin
be0ab4fbfe lib/storage: reset MetricName->TSID cache after marking metricIDs as deleted
This is a follow-up commit after 12b16077c4 ,
which didn't reset the `tsidCache` in all the required places.
This could result in indefinite errors like:

    missing metricName by metricID ...; this could be the case after unclean shutdown; deleting the metricID, so it could be re-created next time

Fix this by resetting the cache inside deleteMetricIDs function.
2020-07-14 14:05:19 +03:00
Aliaksandr Valialkin
0e7b2008b2 app/vmselect/prometheus: do not adjust last points in time series with timestamps exceeding the current time
Such timestamps usually mean that the query contains `offset`.
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/625
2020-07-14 12:56:21 +03:00
Aliaksandr Valialkin
a4c96d9e6d lib/protoparser: properly update vm_protoparser_rows_read_total{type="promscrape"} metric 2020-07-14 12:15:56 +03:00
Seva Poliakov
a5e713b6e0 add vm_protoparser_rows_read_total metrics to promscrape (#624)
* add vm_protoparser_rows_read_total metrics to promscrape

move vm_protoparser_rows_read_total for promscrape to better place

move vm_protoparser_rows_read_total for promscrape to better place

* remove possibility of infinity loop at prometheus parser
2020-07-14 12:02:25 +03:00
Roman Khavronenko
207e93b50d lib/flagutil: specify additional description for all Array type flags (#620)
Array type flag is now defined as `value` type in flag description when printed.
This change adds additional description to every Array type flag so it would be
clear what exact type is used:
```
  -remoteWrite.urlRelabelConfig array
        Optional path to relabel config for the corresponding -remoteWrite.url
        Supports array of values separated by comma or specified via multiple flags.
```
2020-07-13 22:00:03 +03:00
Roman Khavronenko
605711bde5 lib/persistentqueue: add vm_persistentqueue_bytes_pending metric (#619)
Metric `vm_persistentqueue_bytes_pending` is a gauge that shows current amount
of bytes in persistentqueue flushed on disk as a difference between write and read
offsets. This metric is very similar to `vmagent_remotewrite_pending_data_bytes`
except of accounting for bytes in-memory.
2020-07-13 21:54:54 +03:00
Roman Khavronenko
a02097e657 Extend metric vm_promscrape_targets with status label (#615)
The change to `vm_promscrape_targets` metric is supposed to improve observability
for `vmagent` so it will be possible to track how many targets are up or down
for every specific scrape group:
```
vm_promscrape_targets{type="static_configs", status="down"} 1
vm_promscrape_targets{type="static_configs", status="up"} 2
```
2020-07-13 21:54:53 +03:00
Aliaksandr Valialkin
3898cc0285 app/vmselect/prometheus: minimize the diff for the change 1033dc7e2a over 619b0a25c9 2020-07-13 21:41:17 +03:00
faceair
bf39e67ade fix empty response template (#617) 2020-07-13 21:41:15 +03:00
Aliaksandr Valialkin
b6a5c29549 docs/vmagent.md: sync with app/vmagent/README.md 2020-07-13 21:26:00 +03:00
ofen
9ffa688846 Update README.md (#621)
Troubleshooting section updated to help out with duplicate targets detection
2020-07-13 21:25:59 +03:00
Aliaksandr Valialkin
4353ff7ef1 app/vmagent: fix data race when multiple -remoteWrite.urlRelabelConfig options are set
Previously multiple goroutines could access remoteWriteCtx.tss concurrently, which could lead to data race
and improper relabeling. Now each goroutine has its own copy of tss during relabeling.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/599
2020-07-10 15:17:23 +03:00
Aliaksandr Valialkin
805a90f642 app/vmagent/remotewrite: typo fix in -remoteWrite.showURL help message 2020-07-10 14:07:14 +03:00
Aliaksandr Valialkin
5910207d61 vendor: update github.com/valyala/quicktemplate from v1.5.0 to v1.5.1
This should fix incorrect encoding for json strings with char codes below 0x20

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/613
2020-07-10 12:58:40 +03:00
Aliaksandr Valialkin
5d21c79af9 docs/Single-server-VictoriaMetrics.md: sync with the original README.md 2020-07-10 12:16:16 +03:00
Aliaksandr Valialkin
6373d377ef app/{vminsert,vmagent}: add ability to import data in Prometheus exposition format via /api/v1/import/prometheus 2020-07-10 12:13:28 +03:00
Aliaksandr Valialkin
2012e294d1 properly calculate readCalls 2020-07-10 12:01:05 +03:00
Aliaksandr Valialkin
d449d0a0e1 app/vmselect/promql: add missing tests for ifnot binary operation 2020-07-09 13:24:12 +03:00
Aliaksandr Valialkin
7e706eea13 app/vmselect/promql: refactor implementations for and and unless binary operations, so they are closer to or implementation 2020-07-09 13:06:01 +03:00
Aliaksandr Valialkin
6c1a47b5e0 app/vmselect/promql/active_queries.go: simplify code a bit by inlining getNextActiveQueryID function 2020-07-09 11:18:53 +03:00
Aliaksandr Valialkin
418f0e46cb docs: add a link to the The CMS monitoring infrastructure and applications publication from CERN 2020-07-08 20:16:31 +03:00
Aliaksandr Valialkin
87f8c728bf lib/promscrape: send Accept header similar to Prometheus when scraping targets
This should fix scraping Spring Boot servers, which return incorrect response
unless `Accept: text/plain` request header is set.

See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/608
2020-07-08 19:50:06 +03:00
Aliaksandr Valialkin
fd4d593c75 vendor: make vendor-update 2020-07-08 19:24:59 +03:00
Aliaksandr Valialkin
cd58e4356d docs/Cluster-VictoriaMetrics.md: mention about api/v1/status/active_queries page
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/528
2020-07-08 19:15:38 +03:00
Aliaksandr Valialkin
fb86071552 app/vmselect: add /api/v1/status/active_queries page with the list of currently running queries
This is a follow-up for https://github.com/VictoriaMetrics/VictoriaMetrics/pull/575

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/528
2020-07-08 19:09:31 +03:00
DexterZhang
9930ce1fa9 Feat/query list vmselect (#575)
* feat(vmselect): add support for listing current running queries and canceling specific query

* fix(vmselect): change current queries' pid from int64 counter to uuid

* feat(vmselect): add auth to internal operations like `/resetRollupResultCache`, `/query/list` and `/query/kill`. add flag `internalAuthKey` for these auth

* fix(vmselect): add more info to current queries

* review: delete some unnecessary code and use function instead of init

* review: return *queriesMap in newQueriesMap

* review: delete unused var in struct queriesMap, add comments to exported functions

* review: add return if error occurs

* feat(vmselect): truncate query string in current running query list API since the size of query string might be large;
                use query string's pointer in struct `query` for the same reason;
		add query info API to get full access of query's info;
2020-07-08 19:04:29 +03:00
Aliaksandr Valialkin
7335743d57 lib/storage: limit the maximum concurrency for data ingestion to GOMAXPROCS
Previously the concurrency has been limited to GOMAXPROCS*2. This had little sense,
since every call to Storage.AddRows is bound to CPU, so the maximum ingestion bandwidth
is achieved when the number of concurrent calls to Storage.AddRows is limited to the number of CPUs,
i.e. to GOMAXPROCS.
2020-07-08 17:34:27 +03:00
Roman Khavronenko
929ad74de6 lib/protoparser: fix metric name of unmarshal errors in promremotewrite (#607)
The change fixes the typo in metric name `vm_protoparser_unmarshal_errors` to
respect the naming standard.
2020-07-08 14:19:27 +03:00
Aliaksandr Valialkin
e401b8d527 lib/protoparser/graphite: go fmt 2020-07-08 14:13:06 +03:00
Aliaksandr Valialkin
50ecf09042 lib/protoparser/graphite: add more tests after eb45185eef
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/610
2020-07-08 14:13:03 +03:00
Seva Poliakov
1ae0334e17 Fix graphite minus one timestamp (#609)
* fix graphite -1 timestamp

* format the graphite fix -1 timestamp
2020-07-08 14:13:01 +03:00
Aliaksandr Valialkin
fad008df7e lib/storage: clarify out of retention period error message by mentioning -retentionPeriod command-line flag 2020-07-08 13:54:13 +03:00
Aliaksandr Valialkin
fe58462bef lib/storage: reset MetricName->TSID cache after deleting time series
This should prevent from adding new data points to deleted time series
without the need to check for the deleted time series.

This improves ingestion performance a bit when the `deleted time series ids` aka `dmis` set
contains big number of time series.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/596

Based on the idea from @n4mine at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/604
2020-07-06 22:01:24 +03:00
Aliaksandr Valialkin
77bb0e6595 lib/fs: clarify description for -fs.disableMmap command-line flag 2020-07-06 14:28:57 +03:00
Aliaksandr Valialkin
0bff96fe4b lib/storage: prioritize data ingestion over heavy queries
Heavy queries could result in the lack of CPU resources for processing the current data ingestion stream.
Prevent this by delaying queries' execution until free resources are available for data ingestion.

Expose `vm_search_delays_total` metric, which may be used for alerting when there are not enough CPU resources
for data ingestion and/or for executing heavy queries.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2020-07-05 19:44:04 +03:00
Roman Khavronenko
9afd19d375 app/vmalert: add retries to remotewrite (#605)
* app/vmalert: add retries to remotewrite

Remotewrite pkg now does limited number of retries if write request failed.
This is supposed to make vmalert state persisting more reliable.

New metrics were added to remotewrite in order to track rows/bytes sent/dropped.

defaultFlushInterval was increased from 1s to 5s for sanity reasons.

* fix

* wip

* wip

* wip

* fix bits alignment bug for 32-bit systems

* fix mistakenly dropped field
2020-07-05 18:47:38 +03:00
Aliaksandr Valialkin
82871fb7a5 app/vmselect/prometheus: small fixes on top of 8bb762124a 2020-07-05 18:17:53 +03:00
faceair
17f175ff5a fix adjust last points avoid influence earlier value (#606) 2020-07-05 18:17:52 +03:00
Aliaksandr Valialkin
6f1d926698 lib/promscrape: use HostClient.DoDeadline instead of HostClient.Do in order to guarantee strict deadline across multiple scrape attempts 2020-07-03 21:33:48 +03:00
Aliaksandr Valialkin
ee03b4ccbd lib/promscrape: prevent from too big deadline misses on scrape retries
The maximum deadline miss duration is reduced to 2x scrape_interval in the worst case.
By default it is limited to scrape_interval configured for the given scrape target.
2020-07-03 20:42:09 +03:00
Aliaksandr Valialkin
dfa83a4a35 lib/promscrape: check for nil error before checking for the returned status code when scraping targets 2020-07-03 18:37:25 +03:00
Ween
d28fb0baf9 [VMAlert] Fix error log when remoteWrite queue size is full (#602)
* Fix Auto metrics relabeled errors

* Finalize auto-genenated  Labels

* Fix Test Errors

* fix error logs when queue is full

Co-authored-by: xinyulong <xinyulong@kuaishou.com>
2020-07-03 16:50:43 +03:00
Aliaksandr Valialkin
8bb3622e9d app/vminsert: prevent from adding and/or selecting labels with empty values
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/600
2020-07-02 23:17:12 +03:00
Aliaksandr Valialkin
6ebac3ab63 app/vminsert: add ability to apply relabeling to all the incoming metrics if -relabelConfig command-line arg points to a file with a list of relabel_config entries
See https://victoriametrics.github.io/#relabeling
2020-07-02 20:36:33 +03:00
Aliaksandr Valialkin
a45856570b all: typo fix: exptected -> expected 2020-07-02 18:06:21 +03:00
Aliaksandr Valialkin
f10e8809c0 app/vmselect: add interpolate function for filling gaps with linearly interpolated values
See https://stackoverflow.com/q/62565021/274937 for details
2020-07-02 14:54:46 +03:00
Aliaksandr Valialkin
2361ad8ab4 lib/promscrape: add ability to set disable_compression and disable_keepalive options in scrape_config section of the config passed to -promscrape.config
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/580
2020-07-02 14:19:34 +03:00
Aliaksandr Valialkin
0f754bea49 lib/promscrape: add -promscrape.disableKeepAlive command-line flag for disabling http keep-alive connections when scraping targets
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/580
2020-07-01 02:20:46 +03:00
BigFish
aa26b94f33 fix: spelling mistakes (#594)
Co-authored-by: Aliaksandr Valialkin <valyala@gmail.com>
2020-07-01 01:36:40 +03:00
Aliaksandr Valialkin
618bcc818c vendor: make vendor-update 2020-07-01 01:04:05 +03:00
Aliaksandr Valialkin
4cb3e7595c app/vmstorage: add -denyQueriesOutsideRetention command-line flag for denying queries outside the configured retention 2020-07-01 00:58:42 +03:00
Aliaksandr Valialkin
81e3d4305f lib/httpserver: add Unwrap method to ErrorWithStatusCode, so As and Is functions in standard errors package may properly unwrap the error inside ErrorWithStatusCode 2020-07-01 00:53:49 +03:00
Aliaksandr Valialkin
fe77d661b3 all: use errors.As instead of type assertion for detecting net.Error 2020-07-01 00:16:13 +03:00
Aliaksandr Valialkin
0c4e8aeb2b all: use errors.As for inspecting errors that implement httpserver.ErrorWithStatusCode 2020-07-01 00:03:11 +03:00
Aliaksandr Valialkin
d962568e93 all: use %w instead of %s for wrapping errors in fmt.Errorf
This will simplify examining the returned errors such as httpserver.ErrorWithStatusCode .
See https://blog.golang.org/go1.13-errors for details.
2020-06-30 23:33:46 +03:00
Aliaksandr Valialkin
5a43842bd3 lib/promscrape: add missing label sorting for autogenerated metrics
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/592
2020-06-29 22:39:40 +03:00
Ween
b42cf33c4d Fix Auto metrics relabeled errors (#593)
* Fix Auto metrics relabeled errors

* Finalize auto-generated Labels

* Fix Test Errors

Co-authored-by: xinyulong <xinyulong@kuaishou.com>
2020-06-29 22:39:39 +03:00
Roman Khavronenko
156c83d112 app/vmalert: support multiple notifier urls (#584) (#590)
* app/vmalert: support multiple notifier urls (#584)

User now can set multiple notifier URLs in the same fashion
as for other vmutils (e.g. vmagent). The same is correct for
TLS setting for every configured URL. Alerts sending is done
in sequential way for respecting the specified URLs order.

* app/vmalert: add basicAuth support for notifier client (#585)

The change adds possibility to set basicAuth creds for notifier
client in the same fashion as for remote write/read and datasource.
2020-06-29 22:21:56 +03:00
Aliaksandr Valialkin
5341596f96 docs/vmalert.md: sync with app/vmalert/README.md 2020-06-29 22:18:30 +03:00
Roman Khavronenko
bbeab70de6 app/vmalert: move flags description and initialization into subpackages
The change adds no new functionality and aims to move flags definitions
to subpackages that are using them. This should improve readability
of the main function.
2020-06-29 22:18:29 +03:00
kreedom
63c36e2e69 app/vmalert: properly set transport for HTTP clients
Fixes issue #586
2020-06-29 22:18:25 +03:00
Aliaksandr Valialkin
2b504f17de docs: update the info that docker images are built on top of alpine image now
A follow-up after the commit ff624c9125
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/522
2020-06-26 13:52:25 +03:00
Aliaksandr Valialkin
9b06c83cd6 vendor: make vendor-update 2020-06-25 23:44:33 +03:00
Aliaksandr Valialkin
b2b17589fa vendor: update github.com/valyala/fastjson from v1.5.1 to v1.5.2 2020-06-25 23:34:24 +03:00
Aliaksandr Valialkin
aad38c8283 lib/promrelabel: properly apply ^ and $ anchors to regex value in Prometheus relabeling rules 2020-06-25 17:19:02 +03:00
Aliaksandr Valialkin
a586b8b6d4 app/vminsert/netstorage: do not re-route every time series to more than two vmstorage nodes when certain vmstorage nodes are temporarily slower than the rest of them
Previously vminsert may spread data for a single time series across all the available vmstorage nodes
when vmstorage nodes couldn't handle the given ingestion rate. This could lead to increased usage
of CPU and memory on every vmstorage node, since every vmstorage node had to register all the time
series seen in the cluster. Now a time series may spread to maximum two vmstorage nodes under heavy load.
Every time series is routed to a single vmstorage node under normal load.
2020-06-25 16:42:37 +03:00
Aliaksandr Valialkin
12b87b2088 app/vmselect/netstorage: reset big result values every 10 seconds instead of after processing every time series
This should reduce GC pressure when processing time series with big number of rows
2020-06-24 19:37:35 +03:00
Aliaksandr Valialkin
d664bde307 deployment/docker/docker-compose.yml: update Prometheus from v1.18.1 to v1.19.1 and Grafana from v7.0.2 to v7.0.3 2020-06-24 18:09:53 +03:00
Aliaksandr Valialkin
2953c0ec76 docs/Cluster-VictoriaMetrics.md: move VictoriaMetrics logo below "Cluster version" heading, since it is needed for proper navigation at https://victoriametrics.github.io 2020-06-24 12:05:53 +03:00
Aliaksandr Valialkin
8eb2e5384c docs/SampleSizeCalculations.md: updates 2020-06-24 12:05:52 +03:00
Aliaksandr Valialkin
4931b719d7 docs/SampleSizeCalculations.md: add a doc with calculations for the "Lowest sample size" graph at https://victoriametrics.com/ 2020-06-24 12:00:45 +03:00
nicbaz
46c5c0772c vmselect: fix label_replace when mismatch (#579)
As per documentation on `label_replace` function: "If the regular
expression doesn't match then the timeseries is returned unchanged".

Currently this behavior is not enforced, if a regexp on an existing
tag doesn't match then the tag value is copied as-is in the destination
tag. This fix first checks that the regular expression matches the
source tag before applying anything.

Given the current implementation, this fix also changes the behavior
of the **MetricsQL** `label_transform` function which does not
document this behavior at the moment.
2020-06-23 23:54:29 +03:00
Aliaksandr Valialkin
fd7a3d880e lib/fs: go fmt 2020-06-23 23:03:08 +03:00
Aliaksandr Valialkin
08edb90814 lib/fs: fall back to cgo copy for copying the last 4KB of mmaped data
This probably should fix https://github.com/VictoriaMetrics/VictoriaMetrics/issues/581
2020-06-23 22:55:56 +03:00
Aliaksandr Valialkin
1eed50b9ca docs/vmalert.md: sync with app/vmalert/README.md 2020-06-23 22:48:25 +03:00
nicbaz
ea2ed4b7e8 vmalert: add support for TLS configuration (#578)
app/vmalert: add support for TLS configuration

Add support for TLS optional configuration in a similar fashion to what
is currently supported in other vmutils such as vmagent. TLS
configuration options are distinct for datasource, remoteRead,
remoteWrite as well as notifier.
2020-06-23 22:47:23 +03:00
Aliaksandr Valialkin
0fdbe5de25 app/vmselect/netstorage: increase concurrency when processing small number of time series with big number of data points per each time series
Previously VictoriaMetrics was processing up to 32 time series in a single goroutine.
This could be slow if each time series contains big number of data points (10M+ or more), since only a single CPU core could be loaded with work,
while other CPU cores were idle. Fix this by launching GOMAXPROCS workers for time series processing.

This should help with https://github.com/VictoriaMetrics/VictoriaMetrics/issues/572
2020-06-23 22:45:57 +03:00
Aliaksandr Valialkin
3a444bb7bb lib/promrelabel: add support for keep_if_equal and drop_if_equal actions to relabel configs
These actions may be useful for filtering out unneeded targets and/or metrics if they contain equal label values.
For example, the following rule would leave the target only if __meta_kubernetes_annotation_prometheus_io_port
equals __meta_kubernetes_pod_container_port_number:

  - action: keep_if_equal
    source_labels: [__meta_kubernetes_annotation_prometheus_io_port, __meta_kubernetes_pod_container_port_number]
2020-06-23 17:29:19 +03:00
Aliaksandr Valialkin
de7e585ac8 lib/promscrape: preserve the previously discovered targets on discovery errors per each job_name
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/582
2020-06-23 15:42:46 +03:00
Aliaksandr Valialkin
6d9c5ad422 vendor: update github.com/klauspost/compress from v1.10.9 to v1.10.10 2020-06-23 13:47:00 +03:00
Aliaksandr Valialkin
521c657f8d lib/fs: an attempt to fix SIGBUS error by rounding mmap`ed region to multiple of 4KB pages
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/581
2020-06-23 13:40:20 +03:00
Aliaksandr Valialkin
5fb60dd647 lib/logger: add -loggerErrorsPerSecondLimit for limiting the rate of ERROR messages 2020-06-23 12:42:59 +03:00
Aliaksandr Valialkin
a80e852aab lib/promscrape: retry performing the request to the server for up to 3 times before giving up when it closes keep-alive connections
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/580
2020-06-23 12:34:12 +03:00
Aliaksandr Valialkin
97557c96d5 docs/Single-server-VictoriaMetrics.md: remove -httpListenAddr command-line flag from setting up VictoriaMetrics chapter
This flag is optional and it has a good default value - `:8428`, so there is no need to mention it in this chapter
2020-06-22 12:45:50 +03:00
kreedom
f227799c87 Support of custom URL path for alert (#560)
app/vmalert: Support custom URL for alerts source

Add flag `external.alert.source` for configuring custom URL
for alert's source. This may be handy to re-point default source
URL to other systems like Grafana.
Updates #517
2020-06-21 16:33:58 +03:00
Aliaksandr Valialkin
70bf8218bb app/vmselect/promql: properly override label values from group_left and group_right lists like Prometheus does
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/577
2020-06-21 16:32:27 +03:00
Aliaksandr Valialkin
50aa34bcbe lib/promscrape/discovery/consul: reduce load on Consul when discovering big number of targets by using background caching
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/574
2020-06-20 18:20:07 +03:00
Aliaksandr Valialkin
62e1908986 lib/promscrape: reduce default value for -promscrape.discovery.concurrency from 500 to 100
This should reduce load on Kubernetes API server and Consul when big number of targets are discovered

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/574
2020-06-20 17:53:48 +03:00
Aliaksandr Valialkin
1f2826bae2 lib/promscrape/discovery/ec2: expose __meta_ec2_ami like the next Prometheus release will do
See b5d61fb66c for details
2020-06-20 17:45:30 +03:00
Aliaksandr Valialkin
2fc2679a3f app/vminsert/netstorage: remove possible race condition when broken connection may be recovered before acquiring storageNode.bcLock 2020-06-20 16:38:08 +03:00
Aliaksandr Valialkin
85036c2b07 docs/Cluster-VictoriaMetrics.md: add high availability chapter 2020-06-20 15:53:07 +03:00
Aliaksandr Valialkin
96e9eed234 docs/Single-server-VictoriaMetrics.md: mention that vmauth could be used for routing user requests to particular VictoriaMetrics instances 2020-06-19 16:17:05 +03:00
Aliaksandr Valialkin
2e5212ab95 docs/Single-server-VictoriaMetrics.md: add a link to features available for enterprise customers 2020-06-19 13:18:07 +03:00
Aliaksandr Valialkin
9409a31c07 docs/vmauth.md: mention that we can provide custom integration with SAML 2020-06-19 13:13:53 +03:00
Aliaksandr Valialkin
4400700832 app/vminsert: properly replicate data for the last RF-1 storage nodes for -replicationFactor=RF
Previously the data for the last `RF-1` storage nodes has been incorrectly replicated to the first storage node.
2020-06-19 12:40:22 +03:00
Aliaksandr Valialkin
ca4c9023e3 vendor: make vendor-update 2020-06-19 02:40:36 +03:00
Tristan Su
c254b683fd lib/storage: set big/small merge concurrency (#568)
fixed #567

Co-authored-by: Tristan Su <suqing.sq@alibaba-inc.com>
2020-06-19 02:21:55 +03:00
Aliaksandr Valialkin
2e5b6220a4 lib/promrelabel: allows regex capture groups in target_label like Prometheus does
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/569
2020-06-19 02:20:58 +03:00
Aliaksandr Valialkin
4f673a5201 app/vminsert: export metrics for determining ingested rows with dropped or truncated labels
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/565
2020-06-19 01:12:44 +03:00
Aliaksandr Valialkin
af7db914c2 make docs-sync 2020-06-18 23:55:04 +03:00
Aliaksandr Valialkin
fd1afa5c63 docs/Articles.md: add a link to article https://stas.starikevich.com/posts/raspberry-pi-4-prometheus/ 2020-06-18 23:13:54 +03:00
Aliaksandr Valialkin
6939e36fdd app/vmselect/promql: fill gaps on right side with values from left side of or operator in the same way as Prometheus does
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/552
2020-06-18 23:05:23 +03:00
Aliaksandr Valialkin
85c1ccb8b8 app/vminsert/netstorage: add missing return in storageNode.checkHealth on connection failure 2020-06-18 20:51:51 +03:00
Aliaksandr Valialkin
464682f380 app/vminsert/netstorage: periodically check for each -storageNode health, so it could be marked as healthy when it is ready to accept data
This fixes uneven data routing in cluster version when `-replicationFactor` is set to 1 (default value),
i.e. when the replication is disabled.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/546
2020-06-18 20:42:43 +03:00
Aliaksandr Valialkin
5f3a895c23 lib/storage: add key!=".+" filter additionally to negative filter matching empty value such as key!~"|foo"
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/546
2020-06-18 20:05:45 +03:00
Roman Khavronenko
1a01fe2cf2 vmalert-537: allow name duplication for rules within one group. (#559)
Uniqueness of rule is now defined by combination of its name, expression and
labels. The hash of the combination is now used as rule ID and identifies rule within the group.

Set of rules from coreos/kube-prometheus was added for testing purposes to
verify compatibility. The check also showed that `vmalert` doesn't support
`query` template function that was mentioned as limitation in README.
2020-06-18 18:54:35 +03:00
Aliaksandr Valialkin
87151e825e docs/vmbackup.md: mention that backups from single-node and cluster versions are incompatible 2020-06-18 18:54:34 +03:00
Roman Khavronenko
a171f9b03e dashboard: update cluster-version dashboard. (#558)
Fix "Bytes per point" panel query #551.
2020-06-12 22:07:28 +03:00
Aliaksandr Valialkin
cc2225cc49 app/vmselect: fix the error after 936f35920a 2020-06-12 22:00:45 +03:00
Aliaksandr Valialkin
936f35920a app/vmselect/prometheus: allow returning partial response from /api/v1/export if -search.denyPartialResponse=false
This makes `/api/v1/export` behaviour consistent with other `/api/v1/*` handlers.
2020-06-12 21:11:48 +03:00
Aliaksandr Valialkin
35191d8403 docs/vmalert.md: sync with app/vmalert/README.md 2020-06-10 19:37:48 +03:00
Clémence Saussez
0b53e380cf app/vmalert: fix link to testdata (#547)
Fix broken link to vmalert test data
Signed-off-by: Clemence Saussez <clemence@zen.ly>
2020-06-10 19:37:21 +03:00
Aliaksandr Valialkin
c40f29f783 lib/storage: properly match {tag!="|foo"} filters
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/546
2020-06-10 19:34:37 +03:00
Roman Khavronenko
d71b6e6584 vmalert-491: allow to configure concurrent rules execution per group. (#542)
The feature allows to speed up group rules execution by
executing them concurrently.

Change also contains README changes to reflect configuration
details.
2020-06-09 15:22:11 +03:00
Roman Khavronenko
5c049bf4dd vmalert-521: allow to disable rules expression validation. (#536)
This feature may be useful for using `vmalert` with PromQL
compatible datasources like Loki.
2020-06-09 15:19:25 +03:00
Aliaksandr Valialkin
60b8ce47ad vendor: make vendor-update 2020-06-06 00:00:40 +03:00
Aliaksandr Valialkin
356845d716 vendor: update github.com/klauspost/compress from v1.10.7 to v1.10.8 2020-06-05 23:52:08 +03:00
Aliaksandr Valialkin
9f55dea162 lib/httpserver: do not flush and do not close gzip writer if response compression is disabled
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/535
2020-06-05 21:37:46 +03:00
Aliaksandr Valialkin
c1be462d42 app/vmauth: disable automatic response compression/uncompression, since it may work improperly in some cases
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/535
2020-06-05 20:14:07 +03:00
Aliaksandr Valialkin
7680b7155d app/vmauth: emit fatal errors instead of panics when incorrect command-line flags are set 2020-06-05 20:14:05 +03:00
Aliaksandr Valialkin
cf91a94daf lib/backup: properly create missing parent directories in fs.CreateFile 2020-06-05 19:28:25 +03:00
Aliaksandr Valialkin
ba1f764b29 lib/fs: optimize queries that read recent samples for big number of time series
Use standard copy() func instead of mmap-aware copy func for reading recently touched mmap-ed data.
This improves read performance by up to 4x.
2020-06-05 19:10:22 +03:00
Aliaksandr Valialkin
2358d9e41d lib/fs: add a benchmark for ReaderAt.MustReadAt 2020-06-05 19:10:21 +03:00
Aliaksandr Valialkin
01719f4949 app/vmstorage/transport: simplify setupTfss in order to prevent the possibility of nil tfs
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/534
2020-06-05 13:17:26 +03:00
Aliaksandr Valialkin
e4cef1b678 app/vmstorage: prevent from serving conns from vminsert and vmselect after the server is closed
Previously it was possible that the connection is served after the server is closed if the following
steps are performed:

1) Server accepts new connection.
2) Server.MustClose() is called and successfully finished.
3) Server starts processing the connection accepted at step 1. There could be various crashes
   like in https://github.com/VictoriaMetrics/VictoriaMetrics/issues/534 since the storage may be already closed.

Now the server closes the connection at step 3 without processing it.
2020-06-05 11:55:48 +03:00
Aliaksandr Valialkin
58069f5a6a app/vmalert: print brief usage info for vmalert -help 2020-06-05 10:43:24 +03:00
Aliaksandr Valialkin
3848ea3a4a app/vmauth: print brief usage info for vmauth -help 2020-06-05 10:40:11 +03:00
Aliaksandr Valialkin
8ad8ca350a app/vmagent: print brief usage info for vmagent -help 2020-06-05 10:40:10 +03:00
Aliaksandr Valialkin
8b0d9df51d lib/backup/fsremote: create all the parent directories before creating file in CreateFile 2020-06-05 10:25:28 +03:00
Aliaksandr Valialkin
a1d841b33e docs/Cluster-VictoriaMetrics.md: remove obsolete line 2020-06-04 20:20:33 +03:00
Aliaksandr Valialkin
4ef0b1181c docs/Cluster-VictoriaMetrics.md: update stale info about replication 2020-06-04 20:19:12 +03:00
Aliaksandr Valialkin
d49f0597f5 deployment/docker: update Go builder from v1.14.3 to v1.14.4
This fixes the following issue in Go runtime, which could result in program hang - https://github.com/golang/go/issues/38931
2020-06-04 18:07:43 +03:00
Aliaksandr Valialkin
70fe337e7f docs/Cluster-VictoriaMetrics.md: clarify simultaneous usage of replication and deduplication 2020-06-04 18:00:42 +03:00
Aliaksandr Valialkin
3d0a0b3785 lib/fs: optimize MustGetFreeSpace performance by caching the results for up to 2 seconds 2020-06-04 13:14:04 +03:00
DexterZhang
fa103875a0 feat(vmselect): add tmp block dir size metrics vm_tmp_blocks_files_size_total (#527)
* feat(vmselect): add tmp block dir size metrics `vm_tmp_blocks_files_size_total`

* refactor(vmselect): use free space instead of used space in tmp block file metrics

* fix: add `bytes` suffix to tmp dir free space metric
2020-06-04 13:05:50 +03:00
Vyacheslav Mitrofanov
89a922fb19 allow to use values lower than 10 with the flag -memory.allowedPercent (#531)
Co-authored-by: Vyacheslav Mitrofanov <vmitrofanov@mfms.ru>
2020-06-03 23:40:13 +03:00
Denis
21df9025c9 Update docker-compose.yml (#530)
Update to latest version of Prometheus & Grafana.
2020-06-03 23:38:11 +03:00
Aliaksandr Valialkin
faea804b88 app/vmauth: log when -auth.config is reloaded in SIGHUP 2020-06-03 23:22:20 +03:00
Aliaksandr Valialkin
730e4a719f docs/Single-server-VictoriaMetrics.md: clarify Replication section 2020-06-03 21:33:02 +03:00
Aliaksandr Valialkin
e9b9aa4db4 docs/FAQ.md: add a question about the difference between vmagent and Prometheus 2020-06-03 20:56:59 +03:00
Aliaksandr Valialkin
6637641dd8 docs/Cluster-VictoriaMetrics.md: update Replication and data safety chapter 2020-06-03 20:24:21 +03:00
Aliaksandr Valialkin
79adb2dbc7 docs/Single-server-VictoriaMetrics.md: mention vmagent in high availability section 2020-06-03 20:16:36 +03:00
Aliaksandr Valialkin
304f9499cf lib/bytesutil: prevent from garbage collecting s before returning from ToUnsafeBytes 2020-06-03 00:23:27 +03:00
Aliaksandr Valialkin
91cebdccde vendor: update github.com/klauspost/compress from v1.10.6 to v1.10.7 2020-06-03 00:11:25 +03:00
Aliaksandr Valialkin
1aa0eefd18 docs/Single-server-VictoriaMetrics.md: sync with master 2020-06-02 22:47:04 +03:00
Aliaksandr Valialkin
2961e71217 deployment/docker: use alpine base image for docker images in order to improve debuggability
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/522
2020-06-02 22:41:17 +03:00
Aliaksandr Valialkin
32f930d5e7 docs: update FAQ.md 2020-06-02 19:59:18 +03:00
Aliaksandr Valialkin
560ae3c82b docs/vmalert.md: sync with app/vmalert/README.md via make docs-sync 2020-06-02 19:12:53 +03:00
Aliaksandr Valialkin
2ad84be7a3 Makefile: add make docs-sync command for syncing docs contents 2020-06-02 19:12:49 +03:00
Aliaksandr Valialkin
045b87c662 app/vmalert: fix comment for UpdateWith exported methods 2020-06-01 14:35:03 +03:00
Aliaksandr Valialkin
43b14b9569 app/vminsert/netstorage: free up unused memory in buffer after memory usage spikes 2020-06-01 14:33:35 +03:00
Roman Khavronenko
44c51c627f vmalert: Add recording rules support. (#519)
* vmalert: Add recording rules support.

Recording rules support required additional service refactoring since
it wasn't planned to support them from the very beginning. The list
of changes is following:
* new entity RecordingRule was added for writing results of MetricsQL
expressions into remote storage;
* interface Rule now unites both recording and alerting rules;
* configuration parser was moved to separate package and now performs
more strict validation;
* new endpoint for listing all groups and rules in json format was added;
* evaluation interval may be set to every particular group;

* vmalert: uncomment tests

* vmalert: rm outdated TODO

* vmalert: fix typos in README
2020-06-01 13:53:46 +03:00
Aliaksandr Valialkin
37aa4fe282 app/vmagent: reload -remoteWrite.relabelConfig and -remoteWrite.urlRelabelConfig on SIGHUP and on /-/reload
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/518
2020-05-30 14:37:02 +03:00
Aliaksandr Valialkin
a646131a33 app/vmagent: log fatal errors instead of panics when improper command-line flags are passed to vmagent 2020-05-30 14:22:38 +03:00
Aliaksandr Valialkin
f41a01332a app/vminsert/netstorage: evenly distribute rerouted rows among all the available storage nodes
Previously such rows were distributed to the original storage node or to the next storage node.
This may result in uneven load among the remaining storage nodes.
2020-05-30 13:51:09 +03:00
Aliaksandr Valialkin
02b2064d8e app/vminsert/netstorage: do not increment vm_rpc_rows_lost_total when all the vmstorage nodes are unavailable, since vminsert retries sending the data instead of dropping it 2020-05-28 22:36:56 +03:00
Aliaksandr Valialkin
6f94fb6842 docs/CaseStudies.md: add a link to articles about VictoriaMetrics 2020-05-28 21:32:58 +03:00
Aliaksandr Valialkin
c6047b6aa0 docs/Articles.md: added a link to https://medium.com/@IG1.com/sismology-iguana-solutions-monitoring-system-f46e4170447f 2020-05-28 20:10:12 +03:00
Aliaksandr Valialkin
7a61357b5d app/vminsert/netstorage: make sure that the data is always replicated among -replicationFactor vmstorage nodes
Previously vminsert could write multiple copies of the data to a single vmstorage node when the ingestion rate
exceeds the maximum throughput for connections to vmstorage nodes.
2020-05-28 19:59:07 +03:00
Aliaksandr Valialkin
981caa6f0b docs/Cluster-VictoriaMetrics.md: mention that opentsdb/api/put handler is disabled by default 2020-05-28 14:27:17 +03:00
Aliaksandr Valialkin
eca1afdc20 lib/storage: fix Graphite wildcard matching, which has been broken in v1.36.0 2020-05-28 11:58:47 +03:00
Aliaksandr Valialkin
b0131c79b6 lib/storage: improve search speed for time series matching Graphite wildcards such as foo.*.bar.baz
Add index for reverse Graphite-like metric names with dots. Use this index during search for filters
like `__name__=~"foo\\.[^.]*\\.bar\\.baz"` which end with non-empty suffix with dots, i.e. `.bar.baz` in this case.

This change may "hide" historical time series during queries. The workaround is to add `[.]*` to the end of regexp label filter,
i.e. "foo\\.[^.]*\\.bar\\.baz" should be substituted with "foo\\.[^.]*\\.bar\\.baz[.]*".
2020-05-27 21:48:08 +03:00
Aliaksandr Valialkin
fc32881105 vendor: make vendor-update 2020-05-27 18:41:21 +03:00
Aliaksandr Valialkin
b09b5f671e docs/Cluster-VictoriaMetrics.md: mention that nginx can be used as a load balancer in front of vminsert and vmselect 2020-05-27 18:09:39 +03:00
Aliaksandr Valialkin
7bb00cd988 docs: refresh docs about replication support 2020-05-27 17:48:37 +03:00
Aliaksandr Valialkin
77e5165e7b app/vminsert: add -replicationFactor command-line flag for enabling data replication among available -storageNode instances 2020-05-27 17:29:44 +03:00
Aliaksandr Valialkin
b4e3bffe4b app/vminsert/netstorage: emit warnings instead of errors when re-routing data to healthy storage nodes 2020-05-27 16:31:41 +03:00
Aliaksandr Valialkin
75f2f3b09d app/vminsert/netstorage: improve ingestion performance when a single vmstorage node is slower than other vmstorage nodes
Previously the ingestion performance has been limited by the slowest vmstorage node.
Now vminsert should re-route data from the slowest vmstorage node to the remaining nodes.
2020-05-27 15:08:22 +03:00
Aliaksandr Valialkin
9844845d79 app/vminsert: tune the maximum summary buffer size for pending data to 1/4 of available RAM, since 1/2 of RAM is too big considering GOGC overhead 2020-05-25 02:00:37 +03:00
Aliaksandr Valialkin
4a82631e44 app/vminsert: limit the summary buffer sizes for all the storage nodes to a half of the allowed memory 2020-05-25 01:39:33 +03:00
Aliaksandr Valialkin
97feac596f vendor: make vendor-update 2020-05-25 00:06:22 +03:00
Aliaksandr Valialkin
301838e7b1 lib/httpserver: properly set status code for empty response 2020-05-24 23:55:55 +03:00
Aliaksandr Valialkin
64bec11c91 lib/httpserver: fix compression for static files 2020-05-24 22:16:51 +03:00
Aliaksandr Valialkin
99b634e0f9 docs/Single-server-VictoriaMetrics.md: add a video to Zerodha talk about monitoring k8s with VictoriaMetrics 2020-05-24 15:52:13 +03:00
Aliaksandr Valialkin
b747362936 lib/promscrape: mention about -promscrape.maxScrapeSize in the error message when target returns too big response 2020-05-24 14:41:24 +03:00
Aliaksandr Valialkin
fbdce0c6ac docs/Cluster-VictoriaMetrics.md: mention that cluster components may be monitored with vmagent 2020-05-23 14:29:27 +03:00
Aliaksandr Valialkin
319feb4796 docs/CaseStudies.md: add a link to a post about VictoriaMetrics histograms in Zerodha case study 2020-05-23 12:44:32 +03:00
Aliaksandr Valialkin
cc05d0a3b1 docs/CaseStudies.md: add Zerodha case based on monitoring K8s with VictoriaMetrics slides at https://docs.google.com/presentation/d/1g7yUyVEaAp4tPuRy-MZbPXKqJ1z78_5VKuV841aQfsg/edit 2020-05-23 12:41:54 +03:00
Aliaksandr Valialkin
4bd3d4b148 app/vminsert/netstorage: do not return error from storageNode.flushBufLocked when the buffer has been successfully re-routed to healthy nodes
This should reduce the number of false errors in the log and the number of falsely lost rows
2020-05-22 18:29:43 +03:00
Aliaksandr Valialkin
6edc33d9bb app/vminsert/netstorage: capture the first error instead of the last error when sending data to vmstorage
The first error has more chances to point to the real root cause of the issue.
2020-05-22 17:49:33 +03:00
Aliaksandr Valialkin
be7253c084 lib/httpserver: do not recompress already compressed response
This should help with vmauth issue - https://github.com/VictoriaMetrics/VictoriaMetrics/issues/514
2020-05-22 16:45:20 +03:00
Aliaksandr Valialkin
bb4a2bf1aa app/vmauth: fix make run-vmauth command 2020-05-22 16:45:19 +03:00
Aliaksandr Valialkin
0794cb35f2 docs/Single-server-VictoriaMetrics.md: mention about vmauth in Security section 2020-05-21 23:48:32 +03:00
Aliaksandr Valialkin
c0933ce926 docs/Cluster-VictoriaMetrics.md: mention about vmauth service in Multitenancy chapter 2020-05-21 22:53:45 +03:00
Aliaksandr Valialkin
3a3ff50548 docs/Single-server-VictoriaMetrics.md: sync with single-node README.md 2020-05-21 20:45:57 +03:00
Aliaksandr Valialkin
dcbdc009f5 app/vmagent: check for error returned from flag.Set 2020-05-21 16:30:48 +03:00
Aliaksandr Valialkin
b59e089ac7 app/vmagent: add -dryRun option for checking all the configs mentioned in command-line flags without running vmagent
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/362
2020-05-21 15:23:18 +03:00
Aliaksandr Valialkin
482bae8466 lib/promscrape: add -promscrape.config.dryRun flag for checking -promscrape.config for errors or unsupported options
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/508
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/362
2020-05-21 14:54:32 +03:00
Aliaksandr Valialkin
901093279e app/vmstorage/transport: update stale comment - vmstorage now sends small ack packets to vminsert 2020-05-21 14:04:52 +03:00
Aliaksandr Valialkin
a5e57a76eb docs/vmagent.md: sync with app/vmagent/README.md 2020-05-21 12:11:22 +03:00
kreedom
2752d6cb26 vmalert add quotes escape function (#510)
* vmalert add quotes escape function

Co-authored-by: kreedom
2020-05-21 12:10:35 +03:00
Aaron France
b26245c48b Update README.md 2020-05-21 12:10:33 +03:00
Aliaksandr Valialkin
d83c68ca03 app/vmselect/promql: add ascent_over_time(m[d]) and descent_over_time(m[d]) functions
These functions could be useful in GPS tracking apps for calculating the summary for height gain/loss
over the given duration `d`.
2020-05-21 12:06:34 +03:00
Aliaksandr Valialkin
8ff28f5b91 app/vmselect/promql: update numbers after the upgrade of github.com/VictoriaMetrics/metrics from v1.11.2 to v1.11.3 2020-05-20 03:07:07 +03:00
Aliaksandr Valialkin
071d58864b vendor: update github.com/VictoriaMetrics/metrics from v1.11.2 to v1.11.3 2020-05-20 02:55:17 +03:00
faceair
504557785e keep debug symbols (#438)
Co-authored-by: Aliaksandr Valialkin <valyala@gmail.com>
2020-05-20 01:23:14 +03:00
Aliaksandr Valialkin
ec6fb5a323 docs/MetricsQL.md: add a link to https://medium.com/@valyala/promql-tutorial-for-beginners-9ab455142085 2020-05-20 00:51:27 +03:00
Roman Khavronenko
00a7eab43d dashboards: update troubleshooting row (#506)
* Slow metrics load panel was removed since it is hard to interpret without
additional metrics and stats;
* Slow inserts panel was updated to display percentage of slow inserts comparing
to total number of inserts to show the real impact.
2020-05-20 00:51:12 +03:00
Aliaksandr Valialkin
ddc9e69bd6 docs/vmagent.md: mention an alternative to refresh_interval option in scrape configs 2020-05-19 23:10:16 +03:00
Aliaksandr Valialkin
73ec5cf460 lib/promscrape: add -promscrape.discovery.concurrency and -promscrape.discovery.concurrentWaitTime flags for tuning the number of concurrent requests to autodiscovery API servers at Consul or Kubernetes 2020-05-19 17:35:59 +03:00
Aliaksandr Valialkin
7d46dd452a app/vmselect/promql: move common code from aggrFuncOutliersK and newAggrFuncRangeTopK into getRangeTopKTimeseries 2020-05-19 16:11:03 +03:00
Aliaksandr Valialkin
37068064dd app/vmselect/promql: fix outliersk calculations 2020-05-19 14:45:10 +03:00
Aliaksandr Valialkin
5e4d08ac22 docs/Quick-Start.md: mention that vmagent can be used instead of Prometheus in most cases 2020-05-19 14:09:16 +03:00
Aliaksandr Valialkin
fc81ea38d4 app/vmselect/promql: add outliersk(N, m) aggregate function for anomaly detection across groups of similar time series 2020-05-19 13:52:44 +03:00
Aliaksandr Valialkin
9ca781b8f0 app/vmalert/notifier: go fmt 2020-05-19 13:00:18 +03:00
Roman Khavronenko
8e29b4a716 dashboards: updates and fixes for cluster version (#500)
* The new update introduces new row "Troubleshooting" that
contains panels for churn rate and slow-queries/inserts/loads metrics. This row is supposed to reveal the cause of low performance or other issues;
* CPU panel got `short` units instead of `seconds`;
* Overview row was updated with panel showing bytes-per-datapoint stat;
* Overview row was updated with panel showing free disk space.
2020-05-19 11:57:20 +03:00
kreedom
27911ae179 vmalert - add expr to variables, add escape functions (#495)
* vmalert - add expr to variables, add escape functions

Co-authored-by: kreedom
2020-05-19 11:55:03 +03:00
Roman Khavronenko
c7f3e58032 vmalert: avoid sending resolves for pending alerts (#498)
Before the change we were sending notifications to notifier
if following conditions are met:
* alert is in Fire state
* alert is in Inactive state

We were sending Inactive notifications to resolve alert ASAP. 
Unfortunately, we were sending resolves for Pending alerts that become
Inactive, which is wrong.

In this change we delete alert from the active list if
it was Pending and become Inactive. In this way we now
have Inactive alerts only if they were in state Fire before.
See test change for example.
2020-05-19 11:55:00 +03:00
Roman Khavronenko
e5f5342e18 vmalert: fix potential race during configuration reloads (#497)
Configuration reload and rules evaluation can't be executed
at the same time now. This may make reload time longer but
prevents from potential races.
2020-05-19 11:54:55 +03:00
Aliaksandr Valialkin
c0c6581601 docs/Articles.md: add https://www.robustperception.io/evaluating-performance-and-correctness to third-party posts 2020-05-17 00:35:30 +03:00
Aliaksandr Valialkin
32a1fa9fd3 deployment/docker: update Go builder from v1.14.2 to v1.14.3
This should fix the following issues found in Go v1.14.2.
See https://github.com/golang/go/issues?q=milestone%3AGo1.14.3+label%3ACherryPickApproved for details.
2020-05-16 22:53:09 +03:00
Aliaksandr Valialkin
b99d03a956 app/vmalert: run make quicktemplate-gen from the root dir of the repository 2020-05-16 22:45:45 +03:00
Aliaksandr Valialkin
5fbab64b0f docs/Single-server-VictoriaMetrics.md: put contact us email to the top of the page 2020-05-16 22:37:41 +03:00
Aliaksandr Valialkin
0528d3fed9 docs/Single-server-VictoriaMetrics.md: add Replication and Backups sections 2020-05-16 22:28:16 +03:00
Aliaksandr Valialkin
f3dbcb73ce docs/Cluster-VictoriaMetrics.md: add missing endpoints to the list: api/v1/import/csv and api/v1/status/tsdb 2020-05-16 22:12:58 +03:00
Aliaksandr Valialkin
2784015a4d all: print --help output to stdout instead of stderr
This is easier to grep and pipe
2020-05-16 12:03:06 +03:00
Aliaksandr Valialkin
a5a21739ac docs/Quick-Start.md: update old link to Docker hub to new link 2020-05-16 10:23:40 +03:00
Aliaksandr Valialkin
2a8f1e6931 lib/storage: do not increment vm_slow_metric_name_loads_total counter for metric_ids which shouldn't be prefetched, since this may mislead users 2020-05-16 10:23:39 +03:00
Aliaksandr Valialkin
dc16cdd1ca lib/persistentqueue: a follow-up for https://github.com/VictoriaMetrics/VictoriaMetrics/pull/484 2020-05-16 09:32:30 +03:00
肖贝贝
c154a92d29 fix: fix vmagent multi queue may become one because sync bug (#484)
Co-authored-by: xiaobeibei <xiaobeibei@bigo.sg>
2020-05-16 09:32:29 +03:00
Aliaksandr Valialkin
dbf8048134 app/vmrestore: document better that vmrestore works like rsync --delete, i.e. it deletes files in -storageDataPath, which are missing in the backup 2020-05-16 09:02:46 +03:00
Aliaksandr Valialkin
e544155a82 app/vmagent/Makefile: fix make run-vmagent rule 2020-05-15 19:35:16 +03:00
Aliaksandr Valialkin
6c43ba1cb1 app/vmagent/remotewrite: remove unused import after the commit 93267f143f 2020-05-15 17:42:31 +03:00
Aliaksandr Valialkin
1d71253653 app/vmagent/remotewrite: allow ingesting time series with multiple samples at once
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/481
2020-05-15 17:37:27 +03:00
Aliaksandr Valialkin
0f3d46810b lib/backup: remove misleading -dst mention in error message
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/482
2020-05-15 17:13:27 +03:00
Aliaksandr Valialkin
e72518e8c6 lib/backup: download only the remaining parts for partially downloaded files after vmrestore restart
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/487
2020-05-15 17:03:25 +03:00
Aliaksandr Valialkin
a853869e75 app/vmstorage/transport: prevent from uncontrolled memory usage growth when vminsert sends big packets with too long labels
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/490
2020-05-15 15:42:54 +03:00
Aliaksandr Valialkin
d1c8b0d6e9 .github/workflows: an attempt to fix loading of golangci-lint 2020-05-15 15:06:42 +03:00
Aliaksandr Valialkin
bdbb5f6cfe vendor: make vendor-update 2020-05-15 15:03:11 +03:00
Aliaksandr Valialkin
1e5c1d7eaa app/vmstorage: add vm_slow_metric_name_loads_total metric, which could be used as an indicator when more RAM is needed for improving query performance 2020-05-15 14:12:24 +03:00
Aliaksandr Valialkin
d6b9a49481 app/vmstorage: add vm_slow_row_inserts_total and vm_slow_per_day_index_inserts_total metrics for determining whether VictoriaMetrics required more RAM for the current number of active time series 2020-05-15 13:46:57 +03:00
Aliaksandr Valialkin
2c4d05db10 docs/vmalert.md: sync with app/vmalert/README.md 2020-05-15 13:26:57 +03:00
Roman Khavronenko
e850bf0eff vmalert: fix the access to rules slice element by wrong index (#486)
During group's update rules deletion was causing slice
mutations while slice index was assumed to be unchanged.
This caused "slice bounds out of range" errors when multiple
rules were deleted sequentially.
2020-05-15 13:26:06 +03:00
hagen1778
d369450f27 vmalert: update README 2020-05-15 13:26:04 +03:00
Aliaksandr Valialkin
a72f18e821 lib/{storage,mergeset}: further tuning of compression levels depending on block size
This should improve performance for querying newly added data, since it can be unpacked faster.
2020-05-15 13:12:28 +03:00
Aliaksandr Valialkin
2cf2e9955b lib/storage: wait for all the goroutines to finish in TestSearch in order to prevent racy behavior on test finish 2020-05-15 12:12:20 +03:00
Aliaksandr Valialkin
67e331ac62 lib/storage: optimize ingestion performance for new time series 2020-05-15 12:12:19 +03:00
Aliaksandr Valialkin
6838fa876c lib/mergeset: tune compression levels in order to improve ingestion performance a bit 2020-05-15 12:12:15 +03:00
Aliaksandr Valialkin
1b5d272e07 lib/storage: reduce indentation in Storage.add 2020-05-14 23:23:56 +03:00
Aliaksandr Valialkin
71d29a8fa1 lib/storage: return the first error instead of the last error, since the first error usually points to the root cause 2020-05-14 23:18:59 +03:00
Aliaksandr Valialkin
3845420a8f lib: extract common code for returning fast unix timestamp into lib/fasttime 2020-05-14 23:06:50 +03:00
Aliaksandr Valialkin
7e831741f9 lib/{storage,mergeset}: return dst on error from unmarshalBlockHeaders, so it could be reused 2020-05-14 15:32:23 +03:00
Aliaksandr Valialkin
2f42b85e0e lib/storage: document that generateUniqueMetricID should return dense ids 2020-05-14 14:08:59 +03:00
Aliaksandr Valialkin
f442d81648 lib/{storage,mergeset}: cleanup: remove unused partSearch.indexBlockReuse 2020-05-14 14:03:15 +03:00
Aliaksandr Valialkin
4bc3d284fa docs/vmalert.md: sync with app/vmalert/README.md 2020-05-13 22:57:29 +03:00
Roman Khavronenko
e208e76222 vmalert: check if remoteRead object was initialized before calling Restore (#473)
The check for non-nil remoteRead was mistakenly dropped
during refactoring which caused panics when `vmalert`
wasn't configured with `remoteRead` flag.
2020-05-13 22:57:26 +03:00
Roman Khavronenko
1523890742 vmalert: fix flag names and description in README (#475)
Change also adds the recommendation for `remotewrite`
queue error.
2020-05-13 22:57:20 +03:00
肖贝贝
8c3e9adf7f Feat/vmalert add max queue size (#472)
* feat: add remoteWrite.maxQueueSize to reduce queue full
* rename remote(write|read) flags to remote(Write|Read) for the sake of consistency

Co-authored-by: xiaobeibei <xiaobeibei@bigo.sg>
2020-05-13 22:57:16 +03:00
Aliaksandr Valialkin
bac9a684e8 docs/vmbackup.md: add a link to vmbackuper tool 2020-05-13 22:57:11 +03:00
Aliaksandr Valialkin
f3d9a5b0ec app/vmselect/promql: suppress "SA4006: this value of dstValues is never used" error in golangci-lint 2020-05-13 11:46:05 +03:00
Aliaksandr Valialkin
8bb44a5d09 lib/storage: optimize label matching for regexp ending with literal suffix
For example, `{label=~"foo.*bar.+baz"}` contains literal suffix `baz`,
so it should work faster now.
2020-05-13 11:39:05 +03:00
Aliaksandr Valialkin
3b0f66a227 app/vmagent: fix a bug with improper relabeling when multiple -remoteWrite.urlRelabelConfig args are set
This bug could result in incorrect relabeling and metrics' drop.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/467
2020-05-12 22:03:45 +03:00
Aliaksandr Valialkin
18a0caee43 app/vmselect/promql: fix any(..) calculations - return all the data points instead of the first one 2020-05-12 20:36:49 +03:00
Aliaksandr Valialkin
3d3f41b961 app/vmstorage/transport: fix panic during server stop on 32-bit arches
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/212
2020-05-12 20:21:40 +03:00
Aliaksandr Valialkin
c9ab6dc532 lib/fs: do not use mmap for 32-bit arches by default, since they cannot map files bigger than 4GB in RAM 2020-05-12 20:21:39 +03:00
Aliaksandr Valialkin
81b8811cf4 app/vmselect/promql: remove -search.maxPointsPerTimeseries command-line flag
Limit the estimated time series count after aggregation with grouping by the number of source time series.
2020-05-12 19:54:44 +03:00
Aliaksandr Valialkin
408ade27a9 app/vmselect/promql: add any(x) by (y) aggregate function, which returns any time series from q for each group y 2020-05-12 19:50:29 +03:00
Aliaksandr Valialkin
21c2982ac8 app/vmselect/promql: support for sum(x) by (y) limit N syntax in order to limit the number of output time series after aggregation 2020-05-12 19:50:12 +03:00
Aliaksandr Valialkin
f341c6fcc4 Revert "app/vmselect: add -search.estimatedSeriesCountAfterAggregation command-line flag for tuning the probability of OOMs or false-positive not enough memory errors"
This reverts commit fbb7986dd2380fce2fc8633b7eda8b67f419e74c.

Reason for revert: this commit has been removed from single-node version
2020-05-12 19:50:08 +03:00
Aliaksandr Valialkin
d54a93fc81 app/vmagent: fix scraping mTLS targets, which has been broken in v1.35.1
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/470
2020-05-12 17:23:43 +03:00
Aliaksandr Valialkin
405cf44aed app/vmagent,lib/promscrape: do not set HostClient.DialDualStack, since it isnt used if HostClient.Dial is set 2020-05-12 15:24:53 +03:00
Aliaksandr Valialkin
da6a84e147 app/vmagent/remotewrite: properly dial TCP6 addresses set via -remoteWrite.url
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/469
2020-05-12 15:24:50 +03:00
Aliaksandr Valialkin
bd5f4e0344 lib/storage: properly initialize part struct before trying to close it on error
This should prevent from nil pointer dereference bug at https://github.com/VictoriaMetrics/VictoriaMetrics/issues/468 .
2020-05-12 14:54:16 +03:00
Aliaksandr Valialkin
cc825c483b vendor: make vendor-update 2020-05-12 14:26:29 +03:00
Aliaksandr Valialkin
ddd8c9d099 deployment/docker: omit http2 support in *-prod binaries
VictoriaMetrics doesn't use http/2.0, so disable it completely.

Use `nethttpomithttp2` tag defined in Go1.14 for this.
See 2566e21f24 for details.
2020-05-12 14:19:33 +03:00
Aliaksandr Valialkin
4e237b4670 app/vminsert/influx: support passing AccountID and ProjectID via plain TCP and UDP
Now `vminsert` accepts AccountID and ProjectID via `VictoriaMetrics_AccountID` and `VictoriaMetrics_ProjectID` tags
when reading Influx line protocol data via plain TCP or UDP (i.e. when `-influxListenAddr` is set).
2020-05-12 13:13:04 +03:00
Aliaksandr Valialkin
f7753b1469 lib/storage: gradually pre-populate per-day inverted index for the next day
This should prevent from CPU usage spikes at 00:00 UTC every day when
inverted index for new day must be quickly created for all the active time series.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/430
2020-05-12 12:13:32 +03:00
Aliaksandr Valialkin
8c77cb436a lib/storage: typo fixes in error messages: or -> of 2020-05-12 12:12:33 +03:00
Aliaksandr Valialkin
bbf06a4248 lib/storage: speed up matching for common regexps in label filters
The following regexps have been optimized:

* 'foo.+bar'
* 'foo.+bar.+baz'

This should improve performance for matching Graphite-like metrics.
2020-05-11 22:49:01 +03:00
Aliaksandr Valialkin
37254a139a lib/storage: add a benchmark for Graphite-like regexps for metric names 2020-05-11 22:49:00 +03:00
Roman Khavronenko
0157566fdb vmalert: cleanup and restructure of code to improve maintainability (#471)
The change introduces new entity `manager` which replaces
`watchdog`, decouples requestHandler and groups. Manager
supposed to control life cycle of groups, rules and
config reloads.

Groups export an ID method which returns a hash
from filename and group name. ID supposed to be unique
identifier across all loaded groups.

Some tests were added to improve coverage.

Fixed a bug with a wrong annotation value when $value is used in
 templates after metrics are restored.

Notifier interface was extended to accept context.

New set of metrics was introduced for config reload.
2020-05-11 14:35:55 +03:00
Nikolay Khramchikhin
0e8c345ffb vmalert config reload
added config hot reload for vmalert with sighup and api call
2020-05-11 14:35:50 +03:00
Aliaksandr Valialkin
6ce9f81d16 docs/CaseStudies.md: add CERN case study 2020-05-11 14:35:43 +03:00
Aliaksandr Valialkin
6c88e3523b docs/Single-server-VictoriaMetrics.md: small updates for Monitoring and How to start VictoriaMetrics sections 2020-05-08 20:35:31 +03:00
Aliaksandr Valialkin
6646b380ef docs/vmauth.md: fix a link to docker images 2020-05-08 14:11:10 +03:00
Aliaksandr Valialkin
0362bd220e docs/Articles.md: add a link to CERN article at https://indico.cern.ch/event/877333/contributions/3696707/attachments/1972189/3281133/CMS_mon_RD_for_opInt.pdf 2020-05-08 01:25:17 +03:00
Aliaksandr Valialkin
657c3e3fc5 Makefile: suppress false positives for golangci-lint on nil pointer dereference 2020-05-07 19:41:11 +03:00
Aliaksandr Valialkin
28ad350a31 app/vmagent: return 200 from /-/reload endpoint as Prometheus does 2020-05-07 19:29:48 +03:00
Aliaksandr Valialkin
2f28e945b8 lib/httpserver: add -http.shutdownDelay flag for a grace period before http server shutdown
The http server returns 503 non-OK error at `/health` page during grace period,
so load balancers in front of the http server could re-route incoming requests
to other servers.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/463
2020-05-07 15:25:51 +03:00
Aliaksandr Valialkin
3052b479b7 lib/httpserver: reduce typical duration for http server graceful shutdown
Previously the duration for graceful shutdown for http server could take more than a minute
because of improperly set timeouts in setNetworkTimeout.
Now typical duration for graceful shutdown should be reduced to less than 5 seconds.
2020-05-07 14:16:38 +03:00
Aliaksandr Valialkin
dc04040781 docs/{vmagent,vmauth}: small clarifications in the docs 2020-05-07 12:55:06 +03:00
Aliaksandr Valialkin
2b403d3f42 app/vmauth: prevent from attacks with .. in path for accessing resources outside the configured url_prefix 2020-05-07 12:55:04 +03:00
Aliaksandr Valialkin
c43a265716 lib/flagutil: make errcheck happy by explicitly ignoring Array.Set result in tests 2020-05-06 22:37:28 +03:00
Aliaksandr Valialkin
15e3682b40 lib/flagutil: properly parse quoted flag values for flagutil.Array 2020-05-06 22:28:15 +03:00
Aliaksandr Valialkin
20538a2a5d app/vmagent: allow setting independent auth configs per each configured -remoteWrite.url 2020-05-06 16:52:32 +03:00
Aliaksandr Valialkin
12dbb9e22c app/vmagent: properly set client-side TLS certificates for -remoteWrite.url. Previously they were mistakenly set as server-side 2020-05-06 16:50:37 +03:00
Aliaksandr Valialkin
9f39e618ed lib/promscrape/discovery/gce: discover per-zone instances for gce_sd_config in parallel. This should reduce discovery latency 2020-05-06 15:00:23 +03:00
Aliaksandr Valialkin
8665c2edb1 docs/vmagent.md: small fixes 2020-05-06 14:49:25 +03:00
Aliaksandr Valialkin
8ab5e47b5c lib/promscrape: add Prometheus-compatible DNS-based service discovery aka dns_sd_configs 2020-05-06 00:02:41 +03:00
Aliaksandr Valialkin
42d563934b lib/promscrape: properly connect to TCP6 addresses if -enableTCP6 is set 2020-05-06 00:02:40 +03:00
Aliaksandr Valialkin
21b91599c2 docs/{vmauth,vmagent}: fix ports for profiling 2020-05-05 20:16:09 +03:00
Aliaksandr Valialkin
309700ab8c docs/vmauth.md: mention that we can help creating customized proxy 2020-05-05 12:34:08 +03:00
Aliaksandr Valialkin
20e958789a docs/{vmagent,vmauth}: add Profiling section 2020-05-05 11:45:29 +03:00
Aliaksandr Valialkin
1153f30fee docs: add vmauth.md 2020-05-05 11:17:45 +03:00
Aliaksandr Valialkin
782fb30cd0 app/vmauth: build fixes 2020-05-05 11:03:25 +03:00
Aliaksandr Valialkin
de31d16154 app/vmauth: add initial version of vmauth. See https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmauth/README.md for details 2020-05-05 10:56:20 +03:00
Aliaksandr Valialkin
61df59b9ea docs/vmagent.md: /targets page doesn't expose information about improperly configured scrape configs now. It is written in error log instead 2020-05-05 10:56:18 +03:00
Aliaksandr Valialkin
1c8e97c8a0 lib/procutil: add NewSighupChan function, which returns a channel, which is triggered on every SIGHUP 2020-05-05 10:56:15 +03:00
Aliaksandr Valialkin
dde92fccc5 docs/vmalert.md: sync with app/vmalert/README.md 2020-05-05 07:51:32 +03:00
Aliaksandr Valialkin
054457d1f4 lib/promscrape: allow explicitly setting empty token via token: "" in consul_sd_config 2020-05-05 07:49:54 +03:00
Aliaksandr Valialkin
fd739808f3 make vendor-update 2020-05-05 00:53:41 +03:00
Roman Khavronenko
abce2b092f app/vmalert: restore alerts state from datasource metrics (#461)
* app/vmalert: restore alerts state from datasource metrics

Vmalert will restore alerts state for rules that have `rule.For` > 0 from previously written timeseries via `remotewrite.url` flag.

* app/vmalert: mention remotewrite and remoteread configuration in README
2020-05-05 00:52:19 +03:00
Aliaksandr Valialkin
89aa6dbf56 lib/promscrape: add Prometheus-compatible service discovery for Consul aka consul_sd_configs
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/330
2020-05-04 20:53:06 +03:00
Aliaksandr Valialkin
28e0e8fd88 lib/promauth: properly set up client certificate in tls.Config
Previously the client certificate has been mistakenly set up as a server certificate
2020-05-04 20:53:04 +03:00
Aliaksandr Valialkin
ed91fe1d9b lib/promscrape: move common code for discovery api config map handling into discoveryutils 2020-05-04 20:52:58 +03:00
Aliaksandr Valialkin
c50fd219dc lib/promscrape/discovery/kubernetes/: unify apiConfig creation 2020-05-04 20:52:53 +03:00
Aliaksandr Valialkin
54414fefef vendor: update github.com/valyala/quicktemplate from v1.4.1 to v1.5.0 2020-05-04 01:37:34 +03:00
Aliaksandr Valialkin
6606dff58d docs/Single-server-VictoriaMetrics.md: mention that it is recommended upgrading to the latest release before reporting issues 2020-05-04 00:42:33 +03:00
Aliaksandr Valialkin
e3a4b75e59 docs/Cluster-VictoriaMetrics.md: add Multitenancy chapter 2020-05-03 18:01:15 +03:00
Aliaksandr Valialkin
a5880f17af lib/promscrape: remove debug line left after the commit e4aac6ea40 2020-05-03 17:16:19 +03:00
Aliaksandr Valialkin
1f0e8fdc0d lib/promscrape: fix tests after the commit 658a8742ac
The original commit copies `__address__` label to `instance` label when generating per-target labels as Prometheus does.

See https://www.robustperception.io/life-of-a-label for details.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/453
2020-05-03 16:59:29 +03:00
DexterZhang
317688f144 fix(vmagent): different behavior as how prometheus deal with labels. [Issue#453] (#454) 2020-05-03 16:59:28 +03:00
Aliaksandr Valialkin
ab1e6a76bb lib/promscrape: make consistent scrape time offsets across reloads for the same ScrapeURL and Labels
This should make consistent intervals between data points for scrape targets across reloads.
Previously these intervals were random.
2020-05-03 14:31:22 +03:00
Aliaksandr Valialkin
f25416984b lib/promscrape: fix TestGetFileSDScrapeWorkSuccess after 3b234d82e5 2020-05-03 14:31:20 +03:00
Aliaksandr Valialkin
f422203e10 lib/promscrape: reload only modified scrapers on config changes
This should improve scrape stability when big number of targets are scraped and these targets are frequently changed.

Thanks to @xbsura for the idea and initial implementation attempts at the following pull requests:

- https://github.com/VictoriaMetrics/VictoriaMetrics/pull/449
- https://github.com/VictoriaMetrics/VictoriaMetrics/pull/458
- https://github.com/VictoriaMetrics/VictoriaMetrics/pull/459
- https://github.com/VictoriaMetrics/VictoriaMetrics/pull/460
2020-05-03 12:47:16 +03:00
Aliaksandr Valialkin
8f591b848a docs/MetricsQL.md: document first_over_time and last_over_time functions 2020-05-03 12:47:16 +03:00
Aleksey Shirokih
137e371219 Avoid ugly y-label for rows inserted (#457) 2020-05-02 19:06:37 +01:00
Aliaksandr Valialkin
bbaca16ce8 lib/httpserver: rename http.externalURL to http.pathPrefix and improve help message for this flag
The `http.externalURL` flag name was slightly misleading, so it has been renamed to `http.pathPrefix`.
2020-05-02 13:12:24 +03:00
DexterZhang
a0589f2ca5 feat(httpserver): add http.externalUrl config to http server, it adds prefix to http path automatically (#452) 2020-05-02 13:12:23 +03:00
Aliaksandr Valialkin
8e041f1911 docs/Single-server-VictoriaMetrics.md: hint that \n is a single newline char 2020-05-01 13:42:50 +03:00
Aliaksandr Valialkin
b21b73115a app/vminsert: add /-/reload handler in the same way as for vmagent 2020-04-30 02:18:08 +03:00
Aliaksandr Valialkin
a970705d8e lib/procutil: prevent from app termination on SIGHUP signal, since this signal is frequently used for config reload 2020-04-30 02:18:06 +03:00
DexterZhang
ae215e5538 feat(vmagent): add promscrape config reload support via http (#450)
* feat(vmagent): add promscrape config reload support via http endpoint `/-/reload`

* fix: typo fix
2020-04-30 02:18:01 +03:00
Aliaksandr Valialkin
d99f48aa48 lib/httpserver: mention that -http.maxGracefulShutdownDuration command-line flag value can be increased on shutdown timeout 2020-04-30 01:37:02 +03:00
Aliaksandr Valialkin
fbfa6aa9f0 docs/Single-server-VictoriaMetrics.md: mention that it is better to increase CPU and RAM per vmselect node in order to achieve higher query performance 2020-04-30 00:53:14 +03:00
Aliaksandr Valialkin
c19f67a248 docs: add vmalert.md 2020-04-29 17:42:16 +03:00
Artem Navoiev
121f7e1d56 Update README.md 2020-04-29 17:41:04 +03:00
Aliaksandr Valialkin
15876c6425 docs/Single-server-VictoriaMetrics.md: update Alerting section 2020-04-29 17:39:56 +03:00
Aliaksandr Valialkin
de5f923476 lib/promscrape: set 30 seconds timeout for discovery api requests
Previously such requests could hang for long time. This could make debugging harder.
2020-04-29 17:29:03 +03:00
Aliaksandr Valialkin
b6d88bac04 vendor: use github.com/VictoriaMetrics/fasthttp instead of github.com/fasthttp/fasthttp
The upstream fasthttp may contain issues like 996610f021 ,
plus a code that isn't used by VictoriaMetrics. So let's use a private copy under our control instead.
2020-04-29 16:43:09 +03:00
Aliaksandr Valialkin
473188f4fd docs/Single-server-VictoriaMetrics.md: mention that basic downsampling could be made with the help of de-duplication 2020-04-28 16:39:06 +03:00
Aliaksandr Valialkin
9ed4951ec8 lib/metricsql: move it to a separate repository - github.com/VictoriaMetrics/metrics 2020-04-28 15:30:06 +03:00
Aliaksandr Valialkin
cd1145e5f4 app/vmselect: add -search.estimatedSeriesCountAfterAggregation command-line flag for tuning the probability of OOMs or false-positive not enough memory errors 2020-04-28 12:51:48 +03:00
Aliaksandr Valialkin
d78ed50edd lib/storage: recover when metricID->metricName entry is missing in the inverted index after unclean shutdown
Newly added index entries can be missing after unclean shutdown, since they didn't flush to persistent storage yet.
Log about this and delete the corresponding metricID, so it could be re-created next time.
2020-04-28 12:01:32 +03:00
Aliaksandr Valialkin
a858b7e393 app/vmalert: added missing comments for public entities 2020-04-28 11:19:48 +03:00
Aliaksandr Valialkin
716bbe79d4 app/vminsert/netstorage: increase timeout for waiting for ack message after sending big data block to vmstorage 2020-04-28 11:19:46 +03:00
Aliaksandr Valialkin
d435029d10 docs/Articles.md: add https://zerodha.tech/blog/infra-monitoring-at-zerodha/ 2020-04-28 02:24:36 +03:00
Aliaksandr Valialkin
53740d0026 lib/promscrape: handle connection reset when targets responds with http redirect 2020-04-28 02:14:32 +03:00
肖贝贝
3e6f29f462 fix: vmagent not follow 301/302 redirect bug (#445)
Co-authored-by: xiaobeibei <xiaobeibei@bigo.sg>
2020-04-28 02:14:31 +03:00
Aliaksandr Valialkin
424068f804 lib/promscrape: handle connection reset when targets responds with http redirect 2020-04-28 02:14:26 +03:00
肖贝贝
7d045bf2ca fix: vmagent not follow 301/302 redirect bug (#445)
Co-authored-by: xiaobeibei <xiaobeibei@bigo.sg>
2020-04-28 02:14:25 +03:00
Aliaksandr Valialkin
50af16baf2 app/vmalert: fix build 2020-04-28 00:34:01 +03:00
Aliaksandr Valialkin
e3db2c73a6 app/vmalert: sync with master branch 2020-04-28 00:19:42 +03:00
Aliaksandr Valialkin
7644f40763 app/vmalert: include it into the next release 2020-04-28 00:11:41 +03:00
Aliaksandr Valialkin
2aecf7c37c lib/{encoding,decimal}: typo fixes in tests: epxecting->expecting 2020-04-28 00:02:19 +03:00
Aliaksandr Valialkin
806dc73d8a lib/encoding: reduce possibility of failure in TestMarshalInt64ArraySize 2020-04-28 00:02:18 +03:00
Aliaksandr Valialkin
a603a15757 lib/promscrape/discovery/gce: make golangci-lint happy 2020-04-27 19:29:42 +03:00
Aliaksandr Valialkin
86a1d9cb0c lib/promscrape: add initial support for Prometheus-compatible service discovery for Amazon EC2 aka ec2_sd_configs 2020-04-27 19:29:22 +03:00
Aliaksandr Valialkin
1acb6eb25a lib/promscrape/discovery/gce: properly set filter query arg in api url 2020-04-27 16:01:53 +03:00
Aliaksandr Valialkin
0daa37fa02 lib/promscrape/discovery/gce: allow empty project and zone for gce_sd_config 2020-04-27 11:45:45 +03:00
Aliaksandr Valialkin
989d84cf3f app/{vminsert,vmstorage}: wait for ack from vmstorage after each packet sent to it from vminsert
This should protect from possible data loss when `vmstorage` is stopped while the packet is sent from `vminsert`.

This commit switches to new protocol between vminsert and vmstorage, which is incompatible
with the previous protocol. So it is required that both vminsert and vmstorage nodes are updated.
2020-04-27 09:53:26 +03:00
Aliaksandr Valialkin
e933cbac16 lib/storage: postpone reading data from blocks during search
This eliminates the need for storing block data into temporary files on a single-node VictoriaMetrics
during heavy queries, which touch big number of time series over long time ranges.

This improves single-node VM performance on heavy queries by up to 2x.
2020-04-27 08:44:01 +03:00
Aliaksandr Valialkin
23a310cc68 app/vmselect/netstorage: substitute sorting packedTimeseries with the natural order of the fetched blocks
This should minimize the number of disk seeks when reading data from temporary file.
2020-04-26 16:46:17 +03:00
Aliaksandr Valialkin
31861c5b8e lib/promscrape/discovery/gce: allow empty zone arg in gce_sd_config - in this case zones for the given project are automatically discovered 2020-04-26 14:37:38 +03:00
Aliaksandr Valialkin
b16e19c053 lib/storage/dedup.go: go fmt 2020-04-26 14:37:36 +03:00
Aliaksandr Valialkin
a0000c3a6e lib/storage: improve deduplication algorithm
Now it leaves only the first data point on each `-dedup.minScrapeInterval` interval.

Previously it may leave two data points on the interval. This could lead to unexpected results
for `histogram_quantile(phi, sum(rate(buckets)) by (le))` query.
2020-04-26 13:10:18 +03:00
Aliaksandr Valialkin
d9bdda408c docs/{vmbackup,vmrestore}.md: update -help output 2020-04-24 22:44:45 +03:00
Jason Gardner
7a6b2839b4 app/vmbackup: added ability to create and delete snapshots during backup (#428)
* app/vmbackup: added ability to create and delete snapshots during backup

Resolves: https://github.com/VictoriaMetrics/VictoriaMetrics/issues/422

* Add snapshot create and delete url flags

* Fixed errcheck warnings in build
2020-04-24 22:35:50 +03:00
Aliaksandr Valialkin
13b4069c59 lib/storage: postpone label filters matching too many time series instead of giving up with error
This should reduce the frequency of the following errors:

    cannot find tag filter matching less than N time series; either increase -search.maxUniqueTimeseries or use more specific tag filters

    more than N time series found on the time range [...]; either increase -search.maxUniqueTimeseries or shrink the time range
2020-04-24 21:18:52 +03:00
Aliaksandr Valialkin
9b386e594f docs/Single-server-VictoriaMetrics.md: document -search.resetCacheAuthKey 2020-04-24 19:48:13 +03:00
Aliaksandr Valialkin
32b3f959fc app/vmselect: fix description for -search.resetCacheAuthKey 2020-04-24 19:44:35 +03:00
Aliaksandr Valialkin
7c74efd640 lib/promscrape/discovery/gce: make golint happy by ignoring resp.Body.Close() result 2020-04-24 18:13:26 +03:00
Aliaksandr Valialkin
987fcce93d .github/workflows: install dependencies before code checkout
Otherwise dependencies' install mangles go.mod
2020-04-24 17:55:53 +03:00
Aliaksandr Valialkin
069690e3bd lib/promscrape: initial implementation for gce_sd_configs aka Prometheus-compatible service discovery for Google Compute Engine 2020-04-24 17:53:43 +03:00
Aliaksandr Valialkin
cf68c5f66a .github/workflows: enable Go modules when installing dependencies
Disabled Go modules broke golangci-lint build
2020-04-24 17:40:43 +03:00
Aliaksandr Valialkin
c53fd515fe docs/Single-server-VictoriaMetrics.md: mention that -search.maxStalenessInterval can be useful for InfluxDB and TimescaleDB users 2020-04-24 16:23:33 +03:00
Aliaksandr Valialkin
48320cffe0 .github/workflows: install golangci-lint at Dependencies step 2020-04-24 15:37:55 +03:00
Aliaksandr Valialkin
de7887fbf4 .github/workflows: update Go version in actions/setup-go from v1.13 to v1.14 2020-04-24 15:31:12 +03:00
Aliaksandr Valialkin
c66daf1f0a vendor: make vendor-update 2020-04-24 15:28:37 +03:00
Aliaksandr Valialkin
8d76795be5 .github/workflows: use master branch for 'actions/setup-go' and 'actions/checkout' 2020-04-24 14:42:06 +03:00
Aliaksandr Valialkin
de991551f5 lib/promscrape: query /api/v1/namespaces/* for the configured namespaces in kubernetes_sd_config
This should fix authorization issues described at https://github.com/VictoriaMetrics/VictoriaMetrics/issues/432
2020-04-24 14:42:02 +03:00
Aliaksandr Valialkin
387a21c96d lib/promscrape: add -promscrape.configCheckInterval command-line flag for automating config checking
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/431
2020-04-23 23:41:26 +03:00
Aliaksandr Valialkin
83e4c8427e lib/promscrape: access Config entries by reference, so they can be compared by addresses 2020-04-23 14:38:29 +03:00
Aliaksandr Valialkin
a5ad19e836 vendor: update google.golang.org/api from v0.21.0 to v0.22.0 2020-04-23 14:30:58 +03:00
Aliaksandr Valialkin
b0f6d3244c vendor: update github.com/aws/aws-sdk-go from v1.30.8 to v1.30.12 2020-04-23 12:36:14 +03:00
Aliaksandr Valialkin
e220f3eeb6 lib/promscrape: move KubernetesSDConfig to lib/promscrape/discovery/kubernetes 2020-04-23 11:34:30 +03:00
Aliaksandr Valialkin
1187494c8f lib/promscrape/discovery/kubernetes: hide role switch logic behind GetLabels function 2020-04-22 22:16:18 +03:00
Aliaksandr Valialkin
f9526809e5 app/vmselect: add /api/v1/status/tsdb page with useful stats for locating root cause for high cardinality issues
See https://prometheus.io/docs/prometheus/latest/querying/api/#tsdb-stats

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/268
2020-04-22 22:03:23 +03:00
Aliaksandr Valialkin
36f6935ddd vendor: update github.com/valyala/fastjson from v1.5.0 to v1.5.1 2020-04-21 00:04:14 +03:00
Aliaksandr Valialkin
76c4140da7 vendor: update github.com/valyala/gozstd from v1.6.4 to v1.7.0 2020-04-20 23:04:20 +03:00
Aliaksandr Valialkin
f3e5722257 lib/writeconcurrencylimiter: improve docs for -maxConcurrentInserts command-line flag 2020-04-20 21:03:09 +03:00
Aliaksandr Valialkin
b59f1f1504 app/vmselect: add -search.minStalenessInterval command-line flag for removing gaps on graphs built from time series with irregular duration between samples
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/426
2020-04-20 19:42:41 +03:00
Aliaksandr Valialkin
603d4c9217 app/vmselect: merge -search.maxLookback and -search.maxStalenessInterval flags, since it turned out they have an identical purpose :(
Leave both flags for backwards compatibility reasons.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/209
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/426
2020-04-20 19:28:28 +03:00
Aliaksandr Valialkin
82b2524f28 deployment/docker/docker-compose.yml: bump Prometheus from v2.17.1 to v2.17.2 and Grafana from v6.7.1 to v6.7.2 2020-04-20 17:30:22 +03:00
Aliaksandr Valialkin
81481abaa9 lib/promscrape/discovery/kubernetes: reuse a client for empty api_server inside different jobs 2020-04-20 17:07:37 +03:00
Aliaksandr Valialkin
d5b38eeac4 docs/Single-server-VictoriaMetrics.md: mention about vmagent in the end of Prometheus setup section 2020-04-20 16:42:50 +03:00
Aliaksandr Valialkin
db5fe03170 deployment/docker: allow building docker images on top of any base image set via ROOT_IMAGE environment var
For example, the following command will build VictoriaMetrics docker image on top of alpine image:

    ROOT_IMAGE=alpine make package-victoria-metrics
2020-04-20 01:16:21 +03:00
Aliaksandr Valialkin
e6277165af deployment/docker/base: remove unused group and passwd files 2020-04-19 23:31:44 +03:00
Aliaksandr Valialkin
57311d748d Makefile: increase the timeout for make golangci-lint from 1 minute to 2 minutes
This should fix timeout errors on GitHub actions
2020-04-17 19:13:57 +03:00
Aliaksandr Valialkin
1b911f6965 app/vmagent/remotewrite: retry sending data if the server closes keep-alive connection
This should fix the following error when sending data to remote storage:

couldn't send a block with size XX bytes to "YYY": the server closed connection before returning the first response byte. Make sure the server returns 'Connection: close' response header before closing the connection
2020-04-17 15:53:17 +03:00
Aliaksandr Valialkin
6764efde39 lib/promscrape/discovery/kubernetes: update stale comments 2020-04-17 14:06:26 +03:00
Aliaksandr Valialkin
da05904638 vendor: make vendor-update 2020-04-17 13:25:18 +03:00
Aliaksandr Valialkin
9105f72f17 docs/vmagent.md: typo fix: unvailable -> unavailable 2020-04-17 13:12:13 +03:00
Aliaksandr Valialkin
d46311fd93 app/vmagent/README.md: mention about promscrape.suppressScrapeErrors 2020-04-17 13:09:08 +03:00
Aliaksandr Valialkin
b9b5641c2f app/vmselect: properly apply -search.maxLookback to queries sent to /api/v1/query 2020-04-17 12:31:18 +03:00
Dmitry Shihovtsev
41bb31ecf6 Fix misspelled Cortex name in the FAQ (#421) 2020-04-17 12:31:15 +03:00
Aliaksandr Valialkin
d86640d609 lib/promscrape: suppress scrape errors if -promscrape.suppressScrapeErrors flag is set 2020-04-16 23:41:52 +03:00
Aliaksandr Valialkin
70104f3fb1 lib/promscrape: print all the labels for the target on error message for failed scrape
This should improve debuggability.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/420
2020-04-16 23:35:10 +03:00
Aliaksandr Valialkin
266bbec52d lib/promscrape: retry target scraping when the target closes previously established keep-alive connection to it
This should fix the following error:

the server closed connection before returning the first response byte. Make sure the server returns 'Connection: close' response header before closing the connection
2020-04-16 23:25:34 +03:00
Aliaksandr Valialkin
e2c3e1d2e5 docs/Single-server-VictoriaMetrics.md: mention that VictoriaMetrics supports Kubernetes service discovery 2020-04-16 18:40:51 +03:00
Aliaksandr Valialkin
a22a2e9bf4 docs/Single-server-VictoriaMetrics.md: typo fix: unneded -> unneeded 2020-04-16 17:35:47 +03:00
Aliaksandr Valialkin
71c122a814 docs/Single-server-VictoriaMetrics.md: improve docs about metrics deletion 2020-04-16 17:32:44 +03:00
Aliaksandr Valialkin
30baf65aa7 docs/Single-server-VictoriaMetrics.md: mention that the delete API can be protected by authKey 2020-04-16 17:20:25 +03:00
Aliaksandr Valialkin
b2d009c8db lib/logger: typo fix 2020-04-16 00:20:02 +03:00
Aliaksandr Valialkin
d4bc60d63c lib/logger: add WARN level for logging expected errors such as invalid user queries 2020-04-15 20:50:45 +03:00
Aliaksandr Valialkin
d23a8b7462 docs/Single-server-VictoriaMetrics.md: typo fix 2020-04-15 15:22:36 +03:00
Aliaksandr Valialkin
9fd1827824 vendor: make vendor-update 2020-04-15 14:51:51 +03:00
Aliaksandr Valialkin
1d4afde6a9 docs/Single-server-VictoriaMetrics.md: clarify how to use -influxListenAddr command-line option 2020-04-15 12:34:32 +03:00
Aliaksandr Valialkin
a873b553cf app/vmselect: handle timestamp(metric offset X) the same way as Prometheus does
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/415
2020-04-15 12:01:05 +03:00
Aliaksandr Valialkin
99f0cb1f5f lib/promscrape: code cleanup in runScraper func 2020-04-15 11:36:35 +03:00
Aliaksandr Valialkin
90bd92a6f7 docs/Single-server-VictoriaMetrics.md: mention that backfilling can be done via any supported ingestion method 2020-04-15 10:57:26 +03:00
Aliaksandr Valialkin
e9d9638627 lib/storage: skip metricID if the corresponding metricID->metricName is missing in inverted index during search
This case is possible when the corresponding metricID->metricName entry didn't propagate to inverted index yet.

This should fix the following error:

error when searching tsids for tfss [...]: cannot find metricName by metricID 1582417212213420669: EOF
2020-04-15 00:10:11 +03:00
Aliaksandr Valialkin
2ce78c0dde docs/Single-server-VictoriaMetrics.md: add https://github.com/Slapper/ansible-victoriametrics-cluster-role to integrations chapter 2020-04-14 16:28:12 +03:00
Aliaksandr Valialkin
6ec582acb9 lib/promscrape: show information on improperly configured scrape targets at the bottom of /targets page
This is a common error with improperly configured target autodiscovery and/or relabeling.
This error leads to duplicate scraping of the same targets with the same set of labels, which leads
to duplicate samples in time series.
2020-04-14 14:55:13 +03:00
Aliaksandr Valialkin
391fb0903e lib/promscrape/discovery/kubernetes: remove only unused client for API server during cleaning 2020-04-14 14:19:26 +03:00
Aliaksandr Valialkin
636e1578de lib/promscrape: add promrelabel.GetLabelValueByName helper function 2020-04-14 14:12:15 +03:00
Aliaksandr Valialkin
3945bf9dec lib/promscrape: mention job name in error messages when target cannot be scraped
This should improve debuggability
2020-04-14 13:33:18 +03:00
Aliaksandr Valialkin
66da177fe9 lib/promscrape: reset ScrapeWork.ID in tests 2020-04-14 13:31:37 +03:00
Aliaksandr Valialkin
88366cad15 lib/promscrape: properly expose statuses for targets with duplicate scrape urls at /targets page
Previously targets with duplicate scrape urls were merged into a single line on the page.
Now each target with duplicate scrape url is displayed on a separate line.
2020-04-14 13:10:06 +03:00
Aliaksandr Valialkin
09f796e2ab lib/promscrape: remove labels starting with __meta_ after applying relabel_configs as Prometheus does
This should reduce CPU load during scraping when target discovery generates
big number of `__meta_*` labels (for instance, k8s discovery).

See https://www.robustperception.io/life-of-a-label for details.
2020-04-14 12:23:30 +03:00
Aliaksandr Valialkin
f58d15f27c lib/promscrape: rename 'scrape_config->scrape_limit' to 'scrape_config->sample_limit'
`scrape_config` block from Prometheus config contains `sample_limit` field,
while in `vmagent` this field was mistakenly named as `scrape_limit`.
2020-04-14 12:00:03 +03:00
Aliaksandr Valialkin
755f649c72 docs/vmagent.md: mention that vmagent supports kubernetes_sd_configs now 2020-04-13 21:07:00 +03:00
Aliaksandr Valialkin
7c4fb038e3 lib/promscrape: add initial support for kubernetes_sd_config
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/334
2020-04-13 21:03:53 +03:00
Aliaksandr Valialkin
4017163393 lib/promscrape: add -promscrape.config.strictParse flag for detecting errors in -promscrape.config file 2020-04-13 13:15:52 +03:00
Aliaksandr Valialkin
7fbfef2aee lib/promscrape: extract common auth code to lib/promauth 2020-04-13 12:59:22 +03:00
Aliaksandr Valialkin
1ce6c311dd vendor: make vendor-update 2020-04-10 18:42:02 +03:00
Aliaksandr Valialkin
e12c97f0b7 vendor: update github.com/klauspost/compress from v1.10.3 to v1.10.4 2020-04-10 18:38:50 +03:00
Aliaksandr Valialkin
3f417ce4d8 deployment/docker: update Go builder image from go1.14.1 to go1.14.2 2020-04-10 18:19:54 +03:00
Aliaksandr Valialkin
e0c6da8e2a lib/storage: disable deduplication after dedup tests are complete
The rest of tests expect that the de-duplication is disabled.
2020-04-10 17:33:38 +03:00
Aliaksandr Valialkin
8ed0d5471a lib/storage: correctly handle -dedup.minScrapeInterval values smaller than 8ms
Such small values may be used for removing samples with duplicate timestamps.
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/409 for details.
2020-04-10 16:40:41 +03:00
Aliaksandr Valialkin
0b2f678d8e lib/{storage,mergeset}: make sure that requests and misses cache counters never go down 2020-04-10 14:44:52 +03:00
Aliaksandr Valialkin
661cfb03e2 lib/protoparser: add -*TrimTimestamp command-line flags for Influx, Graphite, OpenTSDB and CSV data
These flags can be used for reducing disk space usage for timestamps data ingested over the given protocols
2020-04-10 12:44:46 +03:00
Aliaksandr Valialkin
f0b08dbd9e lib/workingsetcache: accumulate stat counters on cache rotation
This should prevent cache stats counters from going down after cache rotation,
which may corrupt `cache hit ratio` graph on the official Grafana dashboards
when using the following query:

    1 - (sum(rate(vm_cache_misses_total[5m])) by (type) / sum(rate(vm_cache_requests_total[5m])) by (type))
2020-04-10 11:51:47 +03:00
Aliaksandr Valialkin
28c65b58a2 lib/memory: add more details to -memory.allowedPercent help message 2020-04-09 15:34:21 +03:00
Aliaksandr Valialkin
38256bd66d docs: update minimum supported Go version from 1.12 to 1.13 2020-04-07 13:39:15 +03:00
Aliaksandr Valialkin
f5121d1e5f docs/CaseStudies.md: updated ARNES numbers 2020-04-06 16:20:44 +03:00
Aliaksandr Valialkin
65ba430632 docs/CaseStudies.md: prettifying of the formatting 2020-04-06 15:24:57 +03:00
Aliaksandr Valialkin
d278e8e1b6 docs/CaseStudies.md: add ARNES case study 2020-04-06 15:18:31 +03:00
Aliaksandr Valialkin
5f679a0f24 docs/Single-server-VictoriaMetrics.md: cosmetic fixes in Importing CSV data chapter 2020-04-06 12:30:16 +03:00
Aliaksandr Valialkin
4661fa5b34 docs/FAQ.md: small fixes 2020-04-05 13:52:55 +03:00
Aliaksandr Valialkin
0452cb21ee docs/FAQ.md: add more articles about VictoriaMetrics performance 2020-04-05 13:47:38 +03:00
Aliaksandr Valialkin
3656d0b13a docs/Articles.md: added a link to https://www.iunera.com/kraken/fabric/time-series-database/ 2020-04-04 16:40:41 +03:00
Aliaksandr Valialkin
2b4d3effad app/vmagent/remotewrite: add "X-Prometheus-Remote-Write-Version: 0.1.0" http header to remote_write request
This header is required by Cortex (and, probably, other remote storage systems).
See 9c1f44d090/docs/apis.md (remote-api) .

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/399
2020-04-04 16:24:47 +03:00
Aliaksandr Valialkin
87da127fbf app/victoria-metrics: remove accidentally added testdata for single-node VM 2020-04-04 16:09:08 +03:00
Aliaksandr Valialkin
a012f6fe70 app/vmselect/promql: keep metric name after applying first_over_time and last_over_time functions 2020-04-04 14:54:02 +03:00
Aliaksandr Valialkin
a53e332a93 app/vmstorage: add missing shutdown for http server on graceful shutdown
This could result in the following panic during graceful shutdown when `/metrics` page is requested:

http: panic serving 10.101.66.5:57366: runtime error: invalid memory address or nil pointer dereference
goroutine 2050 [running]:
net/http.(*conn).serve.func1(0xc00ef22000)
	net/http/server.go:1772 +0x139
panic(0xa0fc00, 0xe91d80)
	runtime/panic.go:973 +0x3e3
github.com/VictoriaMetrics/VictoriaMetrics/lib/workingsetcache.(*Cache).UpdateStats(0x0, 0xc0000516c8)
	github.com/VictoriaMetrics/VictoriaMetrics/lib/workingsetcache/cache.go:224 +0x37
github.com/VictoriaMetrics/VictoriaMetrics/lib/storage.(*indexDB).UpdateMetrics(0xc00b931d00, 0xc02c41acf8)
	github.com/VictoriaMetrics/VictoriaMetrics/lib/storage/index_db.go:258 +0x9f
github.com/VictoriaMetrics/VictoriaMetrics/lib/storage.(*Storage).UpdateMetrics(0xc0000bc7e0, 0xc02c41ac00)
	github.com/VictoriaMetrics/VictoriaMetrics/lib/storage/storage.go:413 +0x4c5
main.registerStorageMetrics.func1(0x0)
	github.com/VictoriaMetrics/VictoriaMetrics/app/vmstorage/main.go:186 +0xd9
main.registerStorageMetrics.func3(0xc00008c380)
	github.com/VictoriaMetrics/VictoriaMetrics/app/vmstorage/main.go:196 +0x26
main.registerStorageMetrics.func7(0xc)
	github.com/VictoriaMetrics/VictoriaMetrics/app/vmstorage/main.go:211 +0x26
github.com/VictoriaMetrics/metrics.(*Gauge).marshalTo(0xc000010148, 0xaa407d, 0x20, 0xb50d60, 0xc005319890)
	github.com/VictoriaMetrics/metrics@v1.11.2/gauge.go:38 +0x3f
github.com/VictoriaMetrics/metrics.(*Set).WritePrometheus(0xc000084300, 0x7fd56809c940, 0xc005319860)
	github.com/VictoriaMetrics/metrics@v1.11.2/set.go:51 +0x1e1
github.com/VictoriaMetrics/metrics.WritePrometheus(0x7fd56809c940, 0xc005319860, 0xa16f01)
	github.com/VictoriaMetrics/metrics@v1.11.2/metrics.go:42 +0x41
github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver.writePrometheusMetrics(0x7fd56809c940, 0xc005319860)
	github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver/metrics.go:16 +0x44
github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver.handlerWrapper(0xb5a120, 0xc005319860, 0xc005018f00, 0xc00002cc90)
	github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver/httpserver.go:154 +0x58d
github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver.gzipHandler.func1(0xb5a120, 0xc005319860, 0xc005018f00)
	github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver/httpserver.go:119 +0x8e
net/http.HandlerFunc.ServeHTTP(0xc00002d110, 0xb5a660, 0xc0044141c0, 0xc005018f00)
	net/http/server.go:2012 +0x44
net/http.serverHandler.ServeHTTP(0xc004414000, 0xb5a660, 0xc0044141c0, 0xc005018f00)
	net/http/server.go:2807 +0xa3
net/http.(*conn).serve(0xc00ef22000, 0xb5bf60, 0xc010532080)
	net/http/server.go:1895 +0x86c
created by net/http.(*Server).Serve
	net/http/server.go:2933 +0x35c
2020-04-02 21:09:55 +03:00
Aliaksandr Valialkin
bf43ad1d4f docs/Articles.md: move Percona article to third-party 2020-04-02 15:43:19 +03:00
Aliaksandr Valialkin
b6ff251884 docs/Articles.md: add a link to https://blog.cloudera.com/benchmarking-time-series-workloads-on-apache-kudu-using-tsbs/ 2020-04-02 15:41:30 +03:00
Aliaksandr Valialkin
cfea171930 docs/CaseStudies.md: add Adsterra case 2020-04-02 00:49:49 +03:00
Aliaksandr Valialkin
3b744f3c32 app/vmstorage: typo fix 2020-04-01 23:43:09 +03:00
Aliaksandr Valialkin
f838cdc86e app/vmstorage: add vm_free_disk_space_bytes metric for monitoring the remaining disk space at -storageDataPath 2020-04-01 23:10:44 +03:00
Aliaksandr Valialkin
8a02e01210 docs/Single-server-VictoriaMetrics.md: re-organize chapters 2020-04-01 22:41:00 +03:00
Aliaksandr Valialkin
84fa146792 lib/httpserver: remove unnecessary http.HandlerFunc wrapper in gzipHandler 2020-04-01 18:14:47 +03:00
Aliaksandr Valialkin
b3cb188c59 docs/Single-server-VictoriaMetrics.md: small fixes and updates 2020-04-01 18:11:10 +03:00
Aliaksandr Valialkin
d5180dbe78 docs/Cluster-VictoriaMetrics.md: small fixes and updates 2020-04-01 18:06:01 +03:00
Aliaksandr Valialkin
120d452002 docs/Cluster-VictoriaMetrics.md: swap production build and development build chapters 2020-04-01 17:48:43 +03:00
Aliaksandr Valialkin
0ad7aaf535 lib/storage: add missing reset for tagFilter.matchesEmptyValue on tagFilter.Init 2020-04-01 17:40:27 +03:00
Aliaksandr Valialkin
c189104be7 lib/promscrape: reduce timestamp jitter when scraping targets
This should improve compression for timestamps
2020-04-01 16:13:01 +03:00
Aliaksandr Valialkin
4c56acbafa lib/storage: remove duplicate data points on 7/8*minScrapeInterval interval instead of 1/2*minScrapeInterval
This should reduce storage usage and should improve deduplication accuracy
2020-04-01 15:47:04 +03:00
Aliaksandr Valialkin
29d5fbfcd8 docs/Single-server-VictoriaMetrics.md: mention that environment vars may be prefixed with -envflag.prefix 2020-03-31 22:38:26 +03:00
Aliaksandr Valialkin
5792f7296a README.md: mention that response cache must be reset after import historical data 2020-03-31 19:34:06 +03:00
Aliaksandr Valialkin
504ea876f2 lib/storage: handle errors returned from TagFilters.Add when cloning TagFilters with negative filter 2020-03-31 16:18:34 +03:00
Aliaksandr Valialkin
5270b7a097 app/victoria-metrics/testdata: add a test for https://github.com/VictoriaMetrics/VictoriaMetrics/issues/395 2020-03-31 12:51:46 +03:00
Aliaksandr Valialkin
ef714e01c1 lib/storage: add fast path for the previous indexdb search if it doesn't contain per-day inverted index yet 2020-03-31 12:35:15 +03:00
Aliaksandr Valialkin
7e755b4bac lib/storage: optimize per-day inverted index search for tag filters matching big number of time series
- Sort tag filters in the ascending number of matching time series
  in order to apply the most specific filters first.
- Fall back to metricName search for filters matching big number of time series
  (usually these are negative filters or regexp filters).
2020-03-31 00:53:29 +03:00
Aliaksandr Valialkin
d450249955 lib/storage: properly handle {label=~"foo|"} filters as Prometheus does
Such filters must match all the time series with `label="foo"` plus all the time series without `label`

Previously only time series with `label="foo"` were matched.
2020-03-30 20:21:47 +03:00
Aliaksandr Valialkin
b47444e69d lib/envflag: add -envflag.prefix for setting optional prefix for environment vars 2020-03-30 15:51:44 +03:00
Aliaksandr Valialkin
e6e321f542 vendor: make vendor-update 2020-03-30 15:08:00 +03:00
Aliaksandr Valialkin
f4c3a71139 go.mod: update the minimum required Go version from go1.12 to go1.13 2020-03-30 14:57:59 +03:00
Aliaksandr Valialkin
c6cbc0bd19 app/vmselect/prometheus: allow passing relative time to start, end and time args of /api/v1/* queries 2020-03-29 21:56:52 +03:00
Aliaksandr Valialkin
cb8696699a app/vmselect/prometheus: code simplification: (d.Seconds()/1e3) -> d.Milliseconds() 2020-03-29 21:50:35 +03:00
kreedom
f058efb3d1 [vmalert] config parser (#393)
* [vmalert] config parser

* make linter be happy

* fix test

* fix sprintf add test for rule validation
2020-03-29 01:49:40 +02:00
Aliaksandr Valialkin
c66a13bf0f docs: add robots.txt 2020-03-28 23:23:20 +02:00
Aliaksandr Valialkin
ceb6d1459f docs/vmagent.md: add prometheus remote_write proxy use case 2020-03-28 23:17:41 +02:00
Aliaksandr Valialkin
8d55af4e75 docs/CaseStudies.md: add Brandwatch case study 2020-03-28 20:58:46 +02:00
Aliaksandr Valialkin
253844b74c deployment/docker: run docker apps under default user (0, root) in order to preserve backwards compatibility
If docker app is upgraded from root to non-root, then the data pointed by `-storageDataPath` or similar flags
becomes denied to non-root user after the upgrade. This breaks upgrade path. So revert back to default root user
for docker apps.

Users may explicitly execute `docker run --user <non_root_user>` for running docker apps under non-root user.
2020-03-28 19:23:13 +02:00
Roman Khavronenko
a2767fe86f bump Prometheus and Grafana images (#389) 2020-03-28 01:19:19 +02:00
Roman Khavronenko
9373a62f8a Update dashboard according to new Grafana version and some metric renames. (#392)
The list of changes is following:
* fix Uptime panel column styles according to changes introduced in 6.7 Grafana version
* fix panel `vminsert/Rows per insert` due to metric rename - see #336
* change default datasource to VictoriaMetrics since dashboard now uses MetricsQL for `vminsert/Rows per insert` panel
2020-03-28 01:17:38 +02:00
Dmitry Naumov
b84071fc25 Rootless docker images by default (#358)
* Rootless docker images by default

* Migrate to rootless base image

Co-authored-by: Aliaksandr Valialkin <valyala@gmail.com>
2020-03-27 21:18:32 +02:00
Aliaksandr Valialkin
b803bcca6b vendor: make vendor-update 2020-03-27 20:17:26 +02:00
Aliaksandr Valialkin
42c290ce9f lib/httpserver: add -http.maxGracefulShutdownDuration command-line flag for tuning the maximum duration required for graceful shutdown of http server 2020-03-27 20:10:05 +02:00
Aliaksandr Valialkin
8fa80a2dbc lib/uint64set: remove zero buckets after Set.Intersect 2020-03-27 01:16:34 +02:00
Aliaksandr Valialkin
7a35447031 lib/uint64set: small code cleanup and perf tuning
* Remember the last accessed bucket on Has() call.
* Inline fast paths inside Add() and Has() calls.
* Remove fragile code with maxUnsortedBuckets inside bucket32.
2020-03-25 15:29:59 +02:00
Aliaksandr Valialkin
19d93e1a2e deployment/docker: update Go builder from Go1.14.0 to Go1.14.1 2020-03-24 22:35:46 +02:00
Aliaksandr Valialkin
cce936de5b lib/uint64set: go fmt 2020-03-24 22:28:09 +02:00
Aliaksandr Valialkin
7cdac6634c lib/storage: serialize snapshot creation process with mutex
This guarantees that the snapshot contains all the recently added data
from inmemory buffers when multiple concurrent calls to Storage.CreateSnapshot are performed.
2020-03-24 22:27:28 +02:00
Aliaksandr Valialkin
c31b956355 lib/uint64set: added more tests 2020-03-24 22:27:26 +02:00
Aliaksandr Valialkin
8fa9066b98 docs/CaseStudies.md: added a case study from MHI Vestas Offshore Wind 2020-03-14 13:22:41 +02:00
Aliaksandr Valialkin
31a533656e lib/storage: remove obsolete code 2020-03-13 22:42:42 +02:00
Aliaksandr Valialkin
58cb7fc476 app/vmselect: adjust label_map() handling for corner cases
The following corner cases now supported:
* label_map(q, "label", "", "foo") - adds `label="foo"` to series with missing `label`
* label_map(q, "label", "foo", "") - removes `label="foo"` from series

All the unmatched labels are kept unchanged.
2020-03-13 18:41:52 +02:00
Aliaksandr Valialkin
5c5a30734e vendor: update github.com/VictoriaMetrics/metrics from v1.11.0 to v1.11.2
This fixes data race in Histogram
2020-03-13 12:40:15 +02:00
Aliaksandr Valialkin
bf1869d33d lib/promscrape: allow overriding external_labels as Prometheus does
Prometheus docs at https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config say:

> In communication with external systems, they are always applied only
> when a time series does not have a given label yet and are ignored otherwise.

Though this may result in consistency chaos when scrape targets override `external_labels`,
let's stick with Prometheus behavior for the sake of backwards compatibility.

There is last resort in vmagent with `-remoteWrite.label`, which consistently
sets the configured labels to all the metrics before sending them to remote storage.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/366
2020-03-12 20:25:27 +02:00
Aliaksandr Valialkin
0e7a71a245 app/vmselect: add label_map(q, label, srcValue1, dstValue1, ... srcValueN, dstValueN) function to MetricsQL
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/369
2020-03-12 19:13:56 +02:00
Aliaksandr Valialkin
39a977b5aa vendor: update google.golang.org/genproto from fc8f55426688 to da6875a35672 2020-03-12 18:12:46 +02:00
Aliaksandr Valialkin
9ef7bba17b vendor: update golang.org/x/tools from 26f6a1b6802d to 5e2df02acb1e 2020-03-12 18:08:02 +02:00
Aliaksandr Valialkin
d91790543f vendor: update github.com/aws/aws-sdk-go from v1.29.10 to v1.29.22 2020-03-12 17:55:11 +02:00
Aliaksandr Valialkin
fa4d70b428 vendor: update google.golang.org/api from v0.19.0 to v0.20.0 2020-03-12 17:52:02 +02:00
Aliaksandr Valialkin
83fe650ca1 vendor: update golang.org/x/sys from d5e6a3e2c0ae to 5c8b2ff67527 2020-03-12 17:46:39 +02:00
Aliaksandr Valialkin
e5c073a9a1 vendor: update github.com/klauspost/compress from v1.10.1 to v1.10.3 2020-03-12 17:33:11 +02:00
Aliaksandr Valialkin
fa7910fba1 lib/protoparser/csvimport: add missing metric vm_rows_invalid_total{type="csvimport"} 2020-03-12 15:28:10 +02:00
Aliaksandr Valialkin
41b532046f README.md: mention about alternative dashboard for cluster version - https://grafana.com/grafana/dashboards/11831 2020-03-12 15:11:55 +02:00
Aliaksandr Valialkin
50555d89d3 app/vmselect: add -search.maxStalenessInterval for tuning Prometheus data model closer to Influx-style data model 2020-03-11 16:44:03 +02:00
Aliaksandr Valialkin
375d5483fa lib/promscrape: remove possible races when registering and de-registering scrape workers for /targets page 2020-03-11 16:30:43 +02:00
Aliaksandr Valialkin
b46af9678e app/vmagent: mention that vmagent can filter data 2020-03-11 16:23:10 +02:00
Aliaksandr Valialkin
803f919c75 docs/Articles.md: add a link to https://stas.starikevich.com/posts/disk-usage-for-vm-versus-prometheus/ 2020-03-11 04:56:37 +02:00
Aliaksandr Valialkin
187fd89c70 lib/promscrape: consistently update /targets page after SIGHUP 2020-03-11 03:20:38 +02:00
Aliaksandr Valialkin
8939c19281 app/vmstorage: return 500 status code instead of 200 status code on internal errors inside /snapshot/* handlers 2020-03-10 23:54:27 +02:00
Aliaksandr Valialkin
b51e548b64 docs/vmagent.md: sync with app/vmagent/README.md 2020-03-10 21:54:35 +02:00
Aliaksandr Valialkin
f6410ff2bf app/vmselect: add optional max_rows_per_line query arg to /api/v1/export
This arg allows limiting the number of data points that may be exported on a single line.
2020-03-10 21:47:43 +02:00
Aliaksandr Valialkin
2f0a36044c app/{vmagent,vminsert}: add support for importing csv data via /api/v1/import/csv 2020-03-10 21:17:40 +02:00
Aliaksandr Valialkin
7545784a49 all: fix golangci-lint issues 2020-03-10 19:40:03 +02:00
Aliaksandr Valialkin
8a2ea0171a docs/FAQ.md: actualize answer about deduplication 2020-03-09 13:37:39 +02:00
Aliaksandr Valialkin
d70c9b9556 docs: add missing vmagent.png, which is used in vmagent.md 2020-03-09 13:37:38 +02:00
Aliaksandr Valialkin
3fc6599aa2 app/vmagent: properly apply -remoteWrite.sendTimeout to fasthttp.HostClient 2020-03-09 13:31:22 +02:00
Aliaksandr Valialkin
d39dd8aa69 lib/promscrape: do not retry idempotent requests when scraping targets
This should prevent from the following unexpected side-effects of idempotent request retries:
- increased actual timeout when scraping the target comparing to the configured scrape_timeout
- increased load on the target

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/357
2020-03-09 13:31:20 +02:00
Aliaksandr Valialkin
12789a4621 app/vmagent: do not allow non-supported fields in -remoteWrite.relabelConfig and file_sd_configs
This should reduce possible confusion like in the https://github.com/VictoriaMetrics/VictoriaMetrics/issues/363
2020-03-06 20:19:34 +02:00
Aliaksandr Valialkin
47e986c26f app/vmagent: properly add labels set via -remoteWrite.label to metrics before sending them to -remoteWrite.url 2020-03-06 19:28:14 +02:00
Aliaksandr Valialkin
0d893eff36 Makefile: add build and test rules with enabled race detector. These rules have -race suffix
Fix also `unsafe pointer conversion` errors detected by Go1.14. See https://golang.org/doc/go1.14#compiler .
2020-03-05 12:05:16 +02:00
Aliaksandr Valialkin
197d2916ab README.md: add a link to https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/Articles 2020-03-04 20:06:22 +02:00
Aliaksandr Valialkin
7909964cf3 docs/Articles.md: add a link to https://www.percona.com/blog/2020/02/28/better-prometheus-rate-function-with-victoriametrics/ 2020-03-04 20:06:21 +02:00
Aliaksandr Valialkin
0176fc4206 app/vmagent/README.md: small fixes 2020-03-04 18:15:24 +02:00
Aliaksandr Valialkin
ac03be5a2c app/vmagent/README.md: typo fix 2020-03-04 18:05:43 +02:00
Aliaksandr Valialkin
9354b9177a app/vmagent/README.md: clarification 2020-03-04 18:04:06 +02:00
Aliaksandr Valialkin
4302555228 app/vmagent/README.md: add iot and edge monitoring use case 2020-03-04 18:01:40 +02:00
Aliaksandr Valialkin
ea5904fd76 app/vmagent/README.md: add use cases section 2020-03-04 17:42:57 +02:00
Aliaksandr Valialkin
ed355fe6b4 docs/CaseStudies: add Synthesio 2020-03-04 17:12:18 +02:00
Aliaksandr Valialkin
50190263c8 docs/Single-server-VictoriaMetrics.md: sync with README.md 2020-03-03 21:39:31 +02:00
Aliaksandr Valialkin
31a76a7b3a lib/promscrape: consistency renaming: stopCh -> globalStopCh 2020-03-03 20:08:22 +02:00
Aliaksandr Valialkin
f01d1bf4a8 app/vmagent: add -remoteWrite.maxDiskUsagePerURL for limiting the maximum disk usage for each -remoteWrite.url buffer
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/352
2020-03-03 19:49:20 +02:00
Aliaksandr Valialkin
808c17e250 app/vmagent/remotewrite: do not reset empty relabelCtx 2020-03-03 15:01:21 +02:00
Aliaksandr Valialkin
af19ca2483 app/vmagent: add -remoteWrite.urlRelabelConfig for applying individual relabeling for each -remoteWrite.url
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/320
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/308
2020-03-03 13:13:06 +02:00
Aliaksandr Valialkin
c3b239eb1a lib/protoparser/prometheus: allow trailing comma in tags list
The trailing comma is generated by cloudwatch exporter.
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/350
2020-03-02 22:23:28 +02:00
Aliaksandr Valialkin
d23df53ba2 app/vmselect/prometheus: do not add __name__!= filter when searching for all the matching metric names via /api/v1/label/__name__/values with non-empty label filter
This should reduce query time.
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/343
2020-02-28 23:36:38 +02:00
Aliaksandr Valialkin
6a1aab88fd lib/protoparser: metrics renaming: vm_protoparser_<type>_* -> vm_protoparser_*{type="<type>"}
This should improve composability of these metrics in PromQL queries
2020-02-28 20:19:59 +02:00
Aliaksandr Valialkin
0eed71c7f4 app/vmagent/remotewrite: yet another typo fix 2020-02-28 20:07:00 +02:00
Aliaksandr Valialkin
c2e602286c lib/persistentqueue: reset chunk file when the persistent queue is empty 2020-02-28 20:06:59 +02:00
Aliaksandr Valialkin
6cdc97a53f app/vmagent/remotewrite: typo fix 2020-02-28 19:05:11 +02:00
Aliaksandr Valialkin
cc39c9d74b app/vmagent/remotewrite: limit memory usage when big scrape blocks are pushed to remote storage 2020-02-28 18:58:13 +02:00
Aliaksandr Valialkin
6282b29a44 docs/Single-server-VictoriaMetrics.md: sync with README.md 2020-02-28 12:59:02 +02:00
Aliaksandr Valialkin
45d21d18a8 docs: add a doc for vmagent 2020-02-28 12:23:44 +02:00
Aliaksandr Valialkin
8fa1cd24d8 app/vmselect/prometheus: properly pass filter for labelName=__name__ in labelValuesWithMatches
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/343
2020-02-28 12:17:30 +02:00
Aliaksandr Valialkin
cf9aee4ec3 all: properly split vm_deduplicated_samples_total among cluster components
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/345
2020-02-27 23:47:51 +02:00
Aliaksandr Valialkin
5e7b4795bd lib/envflag: typo fix in docs to -envflag.enable: envoronment->environment 2020-02-27 21:56:46 +02:00
Aliaksandr Valialkin
52fe4e68fb deployment/docker: update Go builder from Go1.13.8 to Go1.14.0 2020-02-26 22:14:43 +02:00
Aliaksandr Valialkin
1286cead75 app/vminsert: properly initialize InsertCtx
This should prevent from panic described at https://github.com/VictoriaMetrics/VictoriaMetrics/issues/339
2020-02-26 21:21:02 +02:00
Aliaksandr Valialkin
0597f1e39a app/vmagent: allow setting -httpListenAddr to empty string in order to disable listening for http requests
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/340
2020-02-26 20:58:26 +02:00
Aliaksandr Valialkin
8c2d396e8a make vendor-update 2020-02-26 20:46:24 +02:00
Aliaksandr Valialkin
a6c0d490a3 vendor: update github.com/VictoriaMetrics/metrics from v1.10.1 to v1.11.0 2020-02-26 20:40:34 +02:00
Aliaksandr Valialkin
266101feb4 app/vmagent/README.md: list service discovery mechanisms, which will be added soon
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/334
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/330
2020-02-26 19:28:19 +02:00
Aliaksandr Valialkin
e6a481ab11 lib/promscrape: properly reload new configs on SIGHUP 2020-02-26 13:54:24 +02:00
Aliaksandr Valialkin
fa6815712f lib/promscrape: go fmt 2020-02-26 13:24:40 +02:00
Edouard Hur
fed37ecfcb add envvars details (#337) 2020-02-26 13:23:06 +02:00
Aliaksandr Valialkin
f2a6948a14 lib/promscrape: do not add missing port to __address__ label in order to be consistent with Prometheus behavior
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/331
2020-02-25 20:50:18 +02:00
Aliaksandr Valialkin
c6c7843e93 app/vmagent: add -remoteWrite.maxBlockSize command-line flag for limiting the maximum size of unpacked block to send to remote storage 2020-02-25 19:58:11 +02:00
Aliaksandr Valialkin
c4194020ef app/vmagent: do not allow sending unpacked requests with sizes exceeding -maxInsertRequestSize 2020-02-25 19:35:43 +02:00
Aliaksandr Valialkin
2471340e0d app/vmagent: add ability to accept Influx line protocol data via TCP and UDP
Just set `-influxListenAddr` command-line flag

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/333
2020-02-25 19:18:01 +02:00
Aliaksandr Valialkin
f96fb93ca5 app/vmagent: add a counter for /targets handler calls 2020-02-25 18:17:25 +02:00
Aliaksandr Valialkin
25c570dae7 app/vmagent/README.md: mention that vmagent exposes target statuses at /targets page 2020-02-25 18:16:08 +02:00
Aliaksandr Valialkin
7a045125cc lib/fs: typo fix: read blocks bigger than 8KB via pread() call instead of using mmap 2020-02-25 18:04:06 +02:00
Aliaksandr Valialkin
ca28a3e805 app/vmagent: logo fix 2020-02-25 00:09:55 +02:00
Aliaksandr Valialkin
777a39f7a1 app/vmagent: update docs 2020-02-25 00:09:53 +02:00
Aliaksandr Valialkin
61e67b8922 app/vmagent/README.md: small fixes 2020-02-24 21:26:12 +02:00
Aliaksandr Valialkin
13ee8271d0 lib/envflag: substitute dots with underscores in env var names if -envflag.enable is set
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/311
2020-02-24 21:15:11 +02:00
Aliaksandr Valialkin
6ca1e58d98 app/vmselect/promql: properly take into account the first datapoint when calculating rollup_candlestick 2020-02-24 13:25:07 +02:00
Aliaksandr Valialkin
b58e3fc8a9 app/vmselect/promql: do not take into account values outside the current window in rollup_candlestick
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/309
2020-02-23 18:06:26 +02:00
Yaroslav
c69d4b01f0 fix rollupOpen(), rollupHigh(), rollupLow() functions (#328) 2020-02-23 18:06:25 +02:00
Aliaksandr Valialkin
7ee7614e90 app/vmagent: initial implementation for vmagent 2020-02-23 17:31:54 +02:00
Aliaksandr Valialkin
ab1e66d31f vendor: update github.com/valyala/fastjson from v1.4.5 to v1.5.0 2020-02-23 10:06:48 +02:00
Aliaksandr Valialkin
f22aefdb16 app/vmselect/promql: log when rollupResult cache is cleared 2020-02-21 20:06:53 +02:00
Aliaksandr Valialkin
110cce24d9 lib/storage: add vm_ prefix to deduplicated_samples_total metric 2020-02-21 19:33:36 +02:00
Aliaksandr Valialkin
d5c2a0ce64 app/vmselect: add -search.cacheTimestampOffset command-line flag
This flag can be used for removing gaps on graphs if the difference between the current time
and the timestamps from the ingested data exceeds 5 minutes.

This is the case when the time between data sources and VictoriaMetrics is improperly synchronized.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/312
2020-02-21 14:02:15 +02:00
Aliaksandr Valialkin
c70822db50 app/vmselect: add /internal/resetRollupResultCache handler for resetting response cache
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/312
2020-02-21 14:02:12 +02:00
Aliaksandr Valialkin
51abc84932 deployment/docker: update Go builder from v1.13.7 to v1.13.8 2020-02-20 19:56:36 +02:00
Aliaksandr Valialkin
9d279e26a7 lib/protoparser/prometheus: skip leading whitespace from tag names 2020-02-16 19:06:23 +02:00
Aliaksandr Valialkin
fb5848f536 Makefile: add missing vmbackup and vmrestore to all and all-pure targets 2020-02-16 16:55:34 +02:00
Aliaksandr Valialkin
d687e5518d vendor: make vendor-update 2020-02-16 16:12:28 +02:00
Aliaksandr Valialkin
a2b81b71b9 lib/storage: typo fix 2020-02-16 15:53:48 +02:00
Aliaksandr Valialkin
ad4cb9f3ca lib/storage: prevent from clobbering non-nil lastError in Storage.add 2020-02-16 15:51:35 +02:00
Aliaksandr Valialkin
afecb34491 app/vmstorage: limit the maximum error message size before sending it to client
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/315
2020-02-13 17:33:12 +02:00
Aliaksandr Valialkin
846d7fa7e9 app/vmselect: add sort_by_label(q, label) and sort_by_label_desc(q, label) functions
This is implementation of https://github.com/prometheus/prometheus/pull/1533 for VictoriaMetrics.
2020-02-13 17:01:50 +02:00
Aliaksandr Valialkin
e3b18ca1ab lib/mergeset: skip creating temporary part objects when merging source inmemory parts
This should reduce CPU usage when adding new entries to inverted index.
This should also prevent from creating stalled cleaner goroutines for the created temporary parts,
since they were never closed.

This should fix the following issue: https://github.com/VictoriaMetrics/VictoriaMetrics/issues/316 .
2020-02-13 14:09:13 +02:00
Aliaksandr Valialkin
347aaba79d lib/{storage,mergeset}: use time.Ticker instead of time.Timer where appropriate
It turned out that time.Timer was used in places where time.Ticker must be used instead.
This could result in blocked goroutines as in the https://github.com/VictoriaMetrics/VictoriaMetrics/issues/316 .
2020-02-13 13:21:48 +02:00
Aliaksandr Valialkin
6e0013ca39 app/vmselect/prometheus: typo fix in -latencyOffset description 2020-02-12 14:00:38 +02:00
Aliaksandr Valialkin
22ede83146 make vendor-update 2020-02-10 23:35:14 +02:00
Aliaksandr Valialkin
ebf7785d79 vendor: update github.com/VictoriaMetrics/metrics from v1.9.3 to v1.10.1 2020-02-10 23:07:29 +02:00
Aliaksandr Valialkin
e7d1037210 docs: migrate ExtendedPromQL->MetricsQL in order to be more consistent 2020-02-10 23:03:31 +02:00
Edouard Hur
e8f92a4ee8 Cluster - prometheus metrics fix (#314)
* add missing '/{}' in prom query range requests

* fix missing leading '/' on prom labelValuesErrors path
2020-02-10 22:15:21 +02:00
Aliaksandr Valialkin
fcdd95a6ef lib/envflag: check for incorrect flag values read from environment vars 2020-02-10 16:09:03 +02:00
Aliaksandr Valialkin
9c5db9400c lib/envflag: add -envflag.enable command-line flag for enabling reading flags from environment vars
By default flags are read only from command line. They can be read from environment vars if `-envflag.enable` is set.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/311
2020-02-10 16:09:01 +02:00
Aliaksandr Valialkin
1010a57882 all: allow setting flags via environment vars
Now flags can be set via environment vars with the same names as flags.
Command-line flags override flags set via env vars.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/311
2020-02-10 13:31:21 +02:00
Aliaksandr Valialkin
ea66212c93 lib/storage: move -dedup.minScrapeInterval flag outside lib/storage, so it doesn't show up in vminsert in cluster version 2020-02-10 13:07:25 +02:00
Aliaksandr Valialkin
07c067697e docs/Single-server-VictoriaMetrics.md: sync with README.md 2020-02-07 00:03:05 +02:00
Aliaksandr Valialkin
e6d9ea3094 app/vmselect/promql: do not add step to range end, since this hack became obsolete since commit 9e1119dab8 2020-02-05 21:23:44 +02:00
Aliaksandr Valialkin
4a1de7fee9 app/vmselect/promql: properly adjust time range for data to select
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/309
2020-02-05 21:23:43 +02:00
Aliaksandr Valialkin
8e77b54846 app/vmselect: unconditionally offset -step to rollup_candlestick. This makes results more consistent 2020-02-04 23:31:47 +02:00
Aliaksandr Valialkin
ce38b176bc app/vmselect/promql: automatically apply offset -step to rollup_candlestick function in order to obtain the expected OHLC results
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/309
2020-02-04 23:24:04 +02:00
Aliaksandr Valialkin
4f7116d1ee app/vmselect/promql: adjust rollup_candlestick calculations to the expected results
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/309
2020-02-04 22:43:28 +02:00
Aliaksandr Valialkin
8b360a25e9 lib/logger: initialize output to os.Stderr by default 2020-02-04 22:43:26 +02:00
Aliaksandr Valialkin
c931a540f4 Do not require checking for errors returned from fmt.Fprint
This fixes `make errcheck` error found in lib/logger
2020-02-04 22:03:52 +02:00
Aliaksandr Valialkin
1f271a9815 lib/logger: add -loggerOutput command-line flag
This flag allows changing log output from `stderr` to `stdout` if `-loggerOutput=stdout` is set.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/306
2020-02-04 21:48:24 +02:00
Aliaksandr Valialkin
49ab3fa076 lib/logger: do not clutter -loggerFormat=json output with stack trace
This should improve json parsing
2020-02-04 21:40:20 +02:00
Aliaksandr Valialkin
56d6b8ed0a lib/storage: do not deduplicate blocks with less than 32 samples during merge
This should improve deduplication accuracy for blocks with higher number of samples.
2020-02-04 18:41:37 +02:00
Aliaksandr Valialkin
ccd3aa4f15 app/vmselect: take into account the time the requests wait in the queue if -search.maxConcurrentRequests is exceeded
This will prevent from excess CPU usage for timed out queries.
2020-02-04 16:20:48 +02:00
Aliaksandr Valialkin
e6bf88a4d4 app/vmselect: add a placeholder for /api/v1/metadata, which could be requested by Grafana
See https://prometheus.io/docs/prometheus/latest/querying/api/#querying-metric-metadata

VictoriaMetrics doesn't collect any metadata for metrics, so just return empty response.
2020-02-04 15:56:01 +02:00
Aliaksandr Valialkin
7cde594696 all: do not clash flag description with back-quoted flag types
See https://golang.org/pkg/flag/#PrintDefaults for more details.
2020-02-04 15:56:01 +02:00
Edouard Hur
2ec248453b do not fill max lines (#307) 2020-02-03 21:21:04 +00:00
Roman Khavronenko
ce8eb8a207 improve description for Pending datapoints panel; (#301)
Use bits/s for network usage panels;
2020-02-03 02:07:07 +02:00
Aliaksandr Valialkin
45bc6c62f2 app/vmselect/promql: adjust `and` and `unless` binary operator handling to be consistent with Prometheus 2020-01-31 18:52:51 +02:00
Aliaksandr Valialkin
36ea1b503b deployment/docker: update Go builder from v1.13.6 to v1.13.7 2020-01-31 18:06:10 +02:00
Aliaksandr Valialkin
9b25a2fb67 lib/fs: remove unused readerAt interface 2020-01-31 15:13:00 +02:00
Aliaksandr Valialkin
e3adc095bd all: add -dedup.minScrapeInterval command-line flag for data de-duplication
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/86
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/278
2020-01-31 01:18:54 +02:00
Aliaksandr Valialkin
a45f25699c lib/storage: re-use indexSearch inside Storage.prefetchMetricNames 2020-01-31 01:18:53 +02:00
Aliaksandr Valialkin
cb5c39ee70 lib/fs: optimize small reads for ReaderAt.MustReadAt by reading from memory-mapped space instead of reading from file descriptor
This should improve performance when reading many small blocks.
2020-01-30 15:16:16 +02:00
Aliaksandr Valialkin
da19fffa08 all: rename ReadAt* to MustReadAt* in order not to clash with io.ReaderAt 2020-01-30 15:16:16 +02:00
Aliaksandr Valialkin
1332ddc15e lib/storage: pass missing AccountID and ProjectID to searchMetricName 2020-01-30 15:16:16 +02:00
Aliaksandr Valialkin
4ed5e9a7ce lib/storage: pre-fetch metricNames for the found metricIDs in Search.Init
This should speed up Search.NextMetricBlock loop for big number of found time series.
2020-01-30 15:16:16 +02:00
Alexander Danilov
ced989c966 Fix current/max graphs (#298) 2020-01-29 23:48:36 +00:00
Aliaksandr Valialkin
cb2a2f281f lib/mergeset: properly update lastAccesstime in indexBlockCache entries
This is a follow-up for 6665f10e7b
2020-01-29 21:21:01 +02:00
Aliaksandr Valialkin
170c1c3a4e app/vmselect/promql: add keep_next_value(q) for filling gaps with the next non-empty value 2020-01-29 00:48:14 +02:00
Aliaksandr Valialkin
b3bd64fdb2 docs/Single-server-VictoriaMetrics.md: fix heading size for Third-party contributions section 2020-01-28 23:14:06 +02:00
Aliaksandr Valialkin
a9c1d5b351 app/vminsert: moved -maxInsertRequestSize command-line flag out of lib/prompb in order to prevent its inclusion in vmselect and vmstorage apps 2020-01-28 22:53:50 +02:00
Aliaksandr Valialkin
b28c9a3944 app/vmselect/promql: return expected results from increase() over the beginning of time series, which start from big value
Examples for such counters: OS-level counters for network or cpu stats.
2020-01-28 16:31:05 +02:00
Aliaksandr Valialkin
11c03328ae app/victoria-metrics: remove integration build tag from tests
This simplifies testing with `go test ./app/victoria-metrics` without
the need to remember to pass `-tags=integration` to Go commands.
2020-01-27 20:27:39 +02:00
Aliaksandr Valialkin
9a02ca67e9 docs/Single-server-VictoriaMetrics.md: sync with master 2020-01-27 18:47:46 +02:00
Aliaksandr Valialkin
dab9a63485 docs/Single-server-VictoriaMetrics.md: update Retention section 2020-01-27 18:46:32 +02:00
Aliaksandr Valialkin
2bb9b089d5 README.md: mention https://github.com/AnchorFree/tsdb-remote-write 2020-01-27 18:36:22 +02:00
Aliaksandr Valialkin
3e304890a6 app/vmselect/promql: fix panic on a single zero vmrange bucket in prometheus_buckets() function
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/296
2020-01-27 18:05:12 +02:00
Aliaksandr Valialkin
81ba371eaf lib/logger: fix improperly set skipframes for all the logging functions 2020-01-26 18:34:58 +02:00
Aliaksandr Valialkin
9f595cb2b1 lib/httpserver: log the caller of httpserver.Errorf
Previously log message contained `httpserver.Errorf`, now it contains the caller of `httpserver.Errorf`, which is more useful.
2020-01-25 20:18:06 +02:00
Aliaksandr Valialkin
4d70a81e18 app/vminsert: do not drop pending rows if all the vmstorage backends are unavailable
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/294
2020-01-24 22:10:10 +02:00
Aliaksandr Valialkin
36a1a21d6e lib/protoparser: add parser for Prometheus exposition text format
This parser will be used by vmagent
2020-01-24 20:11:19 +02:00
Aliaksandr Valialkin
0cda6afa8e app/vminsert: move ingestion protocol parsers to lib/protoparser, so they could be re-used in the upcoming vmagent 2020-01-24 16:55:18 +02:00
Aliaksandr Valialkin
a9802fcb72 docs/Articles.md: add a link to https://medium.com/@valyala/billy-how-victoriametrics-deals-with-more-than-500-billion-rows-e82ff8f725da 2020-01-22 19:08:45 +02:00
Aliaksandr Valialkin
ea53a21b02 all: consistently log durations in seconds with millisecond precision
This should improve logs readability
2020-01-22 18:35:24 +02:00
Aliaksandr Valialkin
6eddce1d15 vendor: make vendor-update 2020-01-22 18:09:24 +02:00
Aliaksandr Valialkin
e1a264173a app/vmselect: mention the original query and time range in error messages
This should simplify debugging invalid or heavy queries.
2020-01-22 17:34:35 +02:00
Aliaksandr Valialkin
18a4503261 vendor: update github.com/klauspost/compress from v1.9.7 to v1.9.8
New version should have better gzip compression. See https://github.com/klauspost/compress#changelog
2020-01-22 16:51:17 +02:00
Aliaksandr Valialkin
3c6ae8c947 docs: Mention Slack and Telegram channels for user questions 2020-01-22 16:51:16 +02:00
Aliaksandr Valialkin
e127173984 app/vmselect: mention command-line flag, which could be used for adjusting query timeouts, in timeout errors 2020-01-22 15:53:42 +02:00
Aliaksandr Valialkin
f3b9f8b823 app/vmselect/prometheus: increase default value -maxExportDuration to 30 days, since 10 minutes beat users exporting big amounts of data 2020-01-22 15:53:41 +02:00
Aliaksandr Valialkin
be5adbfda4 vendor: update github.com/VictoriaMetrics/fastcache from v1.5.5 to v1.5.7 2020-01-22 12:30:55 +02:00
Aliaksandr Valialkin
40e564eb9c app/vmselect/promql: add range_over_time(m[d]) function for calculating value range for m over d 2020-01-21 19:05:29 +02:00
Aliaksandr Valialkin
ecddba30fe app/vminsert/netstorage: increase timeout for pushing data from vminsert to vmstorage by 3x
Our clients report that the previous timeout could lead to frequent errors when
vmstorage starts background merge for big parts on slow HDD.
2020-01-21 18:21:49 +02:00
Aliaksandr Valialkin
9eaa2ab871 app/vmselect/promql: add label_match(q, label, regexp) and label_mismatch(q, label, regexp) functions for filtering out time series with labels matching the given regexp 2020-01-21 15:00:35 +02:00
Aliaksandr Valialkin
62b041e90a lib/{mergeset,storage}: properly update lastAccessTime in index and data block cache entries 2020-01-20 15:00:10 +02:00
Aliaksandr Valialkin
b297fec515 README.md: mention that delete API shouldn't be used on a regular basis due to non-zero overhead 2020-01-20 13:28:58 +02:00
Aliaksandr Valialkin
d3b4b0f492 docs/FAQ.md: typo fix according to comment from https://www.reddit.com/message/messages/lezkmo 2020-01-18 18:05:22 +02:00
Aliaksandr Valialkin
179c7db4c9 docs/CaseStudies.md: add links to COLOPL talk about VictoriaMetrics 2020-01-18 17:24:00 +02:00
Aliaksandr Valialkin
cbd0452317 app/vminsert: increase default value for -insert.maxQueueDuration from 30s to 60s
This should help catching up with high ingestion rate after VictoriaMetrics restart.
2020-01-18 14:39:30 +02:00
Aliaksandr Valialkin
607d4418b8 lib/uint64set: add missing bucket32.b16his values 2020-01-18 14:26:23 +02:00
Aliaksandr Valialkin
e3379537cd lib/uint64set: optimize Set.Union
This should improve performance for queries over big number of time series
2020-01-18 13:47:34 +02:00
Aliaksandr Valialkin
5077efd3f7 lib/uint64set: add benchmarks for Set.Union 2020-01-18 13:47:33 +02:00
Aliaksandr Valialkin
a851c75703 lib/storage: skip recovering timestamps order for lossless compression (PrecisionBits=64) 2020-01-17 23:59:19 +02:00
Aliaksandr Valialkin
2084921e64 all: use github.com/klauspost/compress/gzip instead of compress/gzip
`github.com/klauspost/compress/gzip` is more optimized than `compress/gzip`.
This gives better gzip compression and decompression speeds.
2020-01-17 23:59:17 +02:00
Aliaksandr Valialkin
ab4d5d72eb lib/uint64set: reduce memory allocations in Set.AppendTo 2020-01-17 22:33:00 +02:00
Aliaksandr Valialkin
476c7fb109 lib/storage: reduce memory allocations when merging metricID sets 2020-01-17 22:10:56 +02:00
Aliaksandr Valialkin
29d21259f0 lib/uint64set: typo fix in Set.Intersect 2020-01-17 18:11:46 +02:00
Aliaksandr Valialkin
54db08a60f app/vmselect/netstorage: make fmt 2020-01-17 17:46:20 +02:00
Aliaksandr Valialkin
d21cc2d16a app/vmselect/netstorage: limit the maximum size for in-memory buffer for temporary blocks file
This should reduce memory usage on systems with more than 8GB RAM.
2020-01-17 16:28:28 +02:00
Aliaksandr Valialkin
ed1d259b10 lib/uint64set: optimize Intersect, Subtract and Union functions
This should improve performance for queries over big number of time series.
2020-01-17 16:16:43 +02:00
Aliaksandr Valialkin
68d35357b1 lib/uint64set: improve benchmark for Set.Intersect 2020-01-17 16:16:43 +02:00
Aliaksandr Valialkin
b05f6cf11c app/vmselect: limit the default value for -search.maxConcurrentRequests, so it plays well on systems with more than 16 vCPUs
A single heavy request can saturate all the available CPUs, so let's limit the number of concurrent requests to lower value.
This will give more chances for executing insert path.
2020-01-17 16:16:43 +02:00
Aliaksandr Valialkin
a9f683423c app/{vminsert,vmselect}: improve error messages when VictoriaMetrics cannot handle too high number of concurrent inserts / selects 2020-01-17 16:16:43 +02:00
Aliaksandr Valialkin
ffe352ad31 lib/uint64set: add benchmark for Set.Intersect 2020-01-17 16:16:43 +02:00
Aliaksandr Valialkin
bdfb219992 make vendor-update 2020-01-16 14:17:48 +02:00
Aliaksandr Valialkin
4b16b7fd11 all: mention command-line flags used for limiting the incoming request size in error messages
This should improve error logs usability.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/287
2020-01-16 13:06:43 +02:00
Aliaksandr Valialkin
ce0b602405 app/vmselect/promql: fix panic on sum(aggr_over_time(...)) with incorrect number of args 2020-01-15 16:26:16 +02:00
Aliaksandr Valialkin
7d429e2806 lib/uint64set: reduce memory usage in Union, Intersect and Subtract methods
Iterate items with newly added Set.ForEach method instead of allocating `[]uint64`
slice for all the items before the iteration.
2020-01-15 12:15:48 +02:00
Aliaksandr Valialkin
4ecb7f15b6 docs/FAQ.md: add bullet comparison with Cortex and Thanos 2020-01-15 10:47:46 +02:00
Aliaksandr Valialkin
caffb0cd01 lib/{mergeset,storage}: fix uint64 counters alignment for 32-bit architectures (GOARCH=386, GOARCH=arm) 2020-01-14 22:47:42 +02:00
Aliaksandr Valialkin
b03ccbf6f7 lib/{storage,mergeset}: gradually remove stale entries from block cache and index caches
This should reduce memory usage in the long run when old blocks and indexes
aren't accessed anymore.
2020-01-14 21:38:29 +02:00
Aliaksandr Valialkin
8a4d4978a3 deployment/docker: update Prometheus from v2.14.0 to v2.15.2 and Grafana from v6.5.0 to v6.5.2 2020-01-12 23:14:56 +02:00
Aliaksandr Valialkin
cbafb7ae59 docs: add a link to VictoriaMetrics subreddit - https://www.reddit.com/r/VictoriaMetrics/ 2020-01-12 00:07:06 +02:00
Aliaksandr Valialkin
bcd3f0c5bd app/vmselect/promql: add hoeffding_bound_upper(phi, m[d]) and hoeffding_bound_lower(phi, m[d]) functions
These functions can be used for calculating Hoeffding bounds
for `m` over `d` time range and for the given `phi` in the range `[0..1]`.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/283
2020-01-11 14:47:13 +02:00
Aliaksandr Valialkin
fc01b11ddc app/vmselect/promql: return continuous values for min_over_time and max_over_time when step is smaller than scrape_interval 2020-01-11 12:47:57 +02:00
Aliaksandr Valialkin
92e00779fa deployment/docker: switch Go builder from v1.13.5 to v1.13.6 2020-01-11 11:06:12 +02:00
Aliaksandr Valialkin
2cacea8c64 README.md: mention about Prometheus->VictoriaMetrics exporter https://github.com/ryotarai/prometheus-tsdb-dump 2020-01-11 01:28:50 +02:00
Aliaksandr Valialkin
16fb128bbc app/vmselect/promql: do not take into account the previous point before time window in square brackets for min_over_time, max_over_time, rollup_first and rollup_last functions
This makes the behaviour for these functions similar to Prometheus when processing broken time series with irregular data points
like `gitlab_runner_jobs`. See https://gitlab.com/gitlab-org/gitlab-exporter/issues/50 for details.
2020-01-11 00:26:11 +02:00
Aliaksandr Valialkin
1c445bf7eb vendor: update github.com/valyala/fastjson from v1.4.2 to v1.4.5
This should fix parsing Inf values in `/api/v1/import`. The previous attempt to fix this in VictoriaMetrics v1.32.1 was unsuccessful.
2020-01-10 23:14:34 +02:00
Aliaksandr Valialkin
adc36d00b7 app/vmselect/promql: properly handle aggr(aggr_over_time(...)) 2020-01-10 21:57:11 +02:00
Aliaksandr Valialkin
87a106702b app/vmselect/promql: add aggr_over_time(("aggr_func1", "aggr_func2", ...), m[d]) function
This function can be used for simultaneous calculating of multiple `aggr_func*` functions
that accept range vector. For example, `aggr_over_time(("min_over_time", "max_over_time"), m[d])`
would calculate `min_over_time` and `max_over_time` for `m[d]`.
2020-01-10 21:18:12 +02:00
Aliaksandr Valialkin
c314d9a219 app/vmselect/promql: add tmin_over_time(m[d]) and tmax_over_time(m[d]) functions
These functions return timestamp in seconds for the minimum and maximum value for `m` over time range `d`
2020-01-10 19:39:34 +02:00
Aliaksandr Valialkin
706b33dc82 docs: fix spelling typos 2020-01-09 23:43:45 +02:00
Aliaksandr Valialkin
1029b6ab34 lib/backup/s3remote: check whether the file exists before deleting it
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/284
2020-01-09 23:20:51 +02:00
Aliaksandr Valialkin
705af61587 app/{vmbackup,vmrestore}: add backup complete file to backup when it is complete and check for this file before restoring from backup
This should prevent from restoring from incomplete backups.

Add `-skipBackupCompleteCheck` command-line flag to `vmrestore` in order to be able to restore from old backups without `backup complete` file.
2020-01-09 15:35:45 +02:00
Aliaksandr Valialkin
7edbd930d5 vendor: update github.com/valyala/fastjson from v1.4.1 to v1.4.2
This fixes parsing of `inf` and `nan` values in json lines passed to `/api/v1/import`
2020-01-08 20:48:08 +02:00
Aliaksandr Valialkin
da62e894dd README.md: remove height="200px" from logo image, since it is improperly displayed on smartphones 2020-01-08 20:29:37 +02:00
Aliaksandr Valialkin
24a852f900 README.md: typo fix 2020-01-08 14:44:28 +02:00
Aliaksandr Valialkin
7c6df1e51d app/vmselect/promql: skip rate calculation for the first point on time series 2020-01-08 14:42:44 +02:00
Aliaksandr Valialkin
7d8d921db9 docs: add references to Remote Write Storage Wars
Also mention that VictoriaMetrics uses less RAM than Thanos Store Gateway - see https://github.com/thanos-io/thanos/issues/448 for details.
2020-01-04 23:58:27 +02:00
Aliaksandr Valialkin
53e176ed67 lib/storage: limit maxRawRowsPerPartition by 500K for any number of rawRowsShardsPerPartition
This should reduce write amplification for high ingestion rate on multi-CPU systems
2020-01-04 23:58:23 +02:00
Aliaksandr Valialkin
1f941875db docs/CaseStudies.md: add link to Remote Write Storage Wars talk from Adidas at PromCon 2019 2020-01-04 16:51:02 +02:00
Aliaksandr Valialkin
76707b2ab9 app/vmselect/promql: fix calculations for histogram_share 2020-01-04 14:44:15 +02:00
Aliaksandr Valialkin
89b551201c lib/metricsql: export IsRollupFunc and IsTransformFunc, since they can be used by package users 2020-01-04 13:25:13 +02:00
Aliaksandr Valialkin
8cfd4decea LICENSE: update year 2020-01-04 13:21:11 +02:00
Aliaksandr Valialkin
accad01b3e app/vmselect/promql: add missing MetricName into netstorage.Result in tests 2020-01-04 12:53:14 +02:00
Aliaksandr Valialkin
6f29d37cb5 app/vmselect/promql: add histogram_share(le, buckets) function 2020-01-04 12:53:08 +02:00
Aliaksandr Valialkin
2290503140 app/vmselect/promql: add absent_over_time(m[d]) func similar to the function in Prometheus 2.16
See https://github.com/prometheus/prometheus/issues/2882
2020-01-04 12:53:01 +02:00
Aliaksandr Valialkin
67f94bbe12 app/vmselect/promql: add histogram_over_time(m[d]) rollup function 2020-01-04 12:52:56 +02:00
Aliaksandr Valialkin
9a1f6848ca app/vmselect/promql: fix results caching for multi-arg rollup functions such as quantile_over_time
Previously only a single arg was taken into account, so caching didn't work properly for multi-arg rollup funcs.
2020-01-03 20:44:54 +02:00
Aliaksandr Valialkin
3d0c7b095a app/vmselect/promql: use scrapeInterval instead of window in denominator when calculating rate for the first point on the time series
This should provide better estimation for `rate` in the beginning of time series.
2020-01-03 19:03:32 +02:00
Aliaksandr Valialkin
588531dd76 lib/uint64set: reduce memory usage when storing big number of sparse metric_id values 2020-01-03 18:17:17 +02:00
Aliaksandr Valialkin
6ea7f23446 app/vmselect/promql: increase the estimated number of time series returned by aggr() by (something) from 100 to 1K, since 100 may result in OOM for high number of time series 2020-01-03 01:02:30 +02:00
Aliaksandr Valialkin
e0abf45d45 app/vmselect/promql: add share_le_over_time and share_gt_over_time functions for SLI and SLO calculations 2020-01-03 00:41:36 +02:00
Aliaksandr Valialkin
19962e2732 docs: refer to standalone MetricsQL package 2020-01-02 23:43:43 +02:00
Aliaksandr Valialkin
a15a6d9ac1 vendor: update github.com/VictoriaMetrics/fastcache from v1.5.4 to v1.5.5 2019-12-29 18:18:12 +02:00
Aliaksandr Valialkin
0d2e83e9d7 lib/metricsql: add example for ExpandWithExprs 2019-12-26 21:31:15 +02:00
Aliaksandr Valialkin
e3ae813e6a vendor: make vendor-update 2019-12-26 19:42:53 +02:00
Aliaksandr Valialkin
940c55f9d1 vendor: update github.com/valyala/gozstd from v1.6.3 to v1.6.4
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/281
2019-12-26 19:31:07 +02:00
Aliaksandr Valialkin
eb1a66c577 lib/metricsql: add ExpandWithExprs 2019-12-25 22:20:21 +02:00
Aliaksandr Valialkin
453d71d082 Rename lib/promql to lib/metricsql and apply small fixes 2019-12-25 22:09:09 +02:00
Mike Poindexter
009d1559db Split Extended PromQL parsing to a separate library 2019-12-25 22:09:07 +02:00
Aliaksandr Valialkin
ff18101d30 app/vmselect/promql: make sure AdjustStartEnd returns time range covering the same number of points as the initial time range
This should prevent from the following panic at app/vmselect/promql/binary_op.go:255:

    BUG: len(leftVaues) must match len(rightValues) and len(dstValues)
2019-12-24 22:45:49 +02:00
Aliaksandr Valialkin
f22c9dbb0f lib/fs: typo fix in fadvise_unix.go 2019-12-24 21:00:04 +02:00
Aliaksandr Valialkin
d3c185f0ca lib/encoding: log the compressed block contents if it cannot be decompressed or unmarshaled
This should help detecting the root cause of https://github.com/VictoriaMetrics/VictoriaMetrics/issues/281
2019-12-24 20:48:25 +02:00
Aliaksandr Valialkin
091e35cf0c lib/encoding: mention src contents in error message returned from unmarshalInt64NearestDelta*
This should simplify detecting the root cause of the issue at https://github.com/VictoriaMetrics/VictoriaMetrics/issues/281
2019-12-24 20:41:38 +02:00
Aliaksandr Valialkin
0e51058a0d lib/encoding: mention unpacked block size in the error message if unparsed tail left
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/281
2019-12-24 20:35:20 +02:00
Aliaksandr Valialkin
e24ee43109 app/vmselect/promql: adjust calculations for rate and increase for the first value
These calculations should trigger alerts on `/api/v1/query` for counters starting from values greater than 0.
2019-12-24 19:41:03 +02:00
Aliaksandr Valialkin
9a2554691c app/vmselect/promql: properly calculate rate on the first data point
It is calculated as `value / scrape_interval`, since the value was missing on the previous scrape,
i.e. we can assume its value was 0 at this time.
2019-12-24 15:55:15 +02:00
Aliaksandr Valialkin
97de50dd4c app/vmselect/netstorage: improve error message when reading data size in readBytes 2019-12-24 14:40:14 +02:00
Aliaksandr Valialkin
c0060c5858 deployment/docker: remove Docker image tag in docker-compose images
This should allow loading latest images by default
2019-12-24 13:27:42 +02:00
Aliaksandr Valialkin
29d2ce54cb all: use gozstd instead of pure Go zstd for GOARCH=amd64 2019-12-24 12:43:59 +02:00
Aliaksandr Valialkin
afa8b34d27 Revert "lib/logger: prevent from blocking when log output isn't consumed in timely manner"
This reverts commit 9f50232e70.

Reason to revert: this leaves incomplete logs on app shutdown.
2019-12-24 12:20:45 +02:00
Aliaksandr Valialkin
6358cf3d47 app/vmselect/netstorage: move MustAdviseSequentialRead to lib/fs 2019-12-23 23:16:26 +02:00
Aliaksandr Valialkin
44f886cc9c lib/encoding/zstd: typo fix 2019-12-23 18:37:20 +02:00
Aliaksandr Valialkin
108a60d69e lib/encoding/zstd: call zstd.Decoder.Close instead of zstd.Decoder.Reset in order to free up occupied goroutines
This should fix goroutine leak for https://github.com/klauspost/compress/issues/195
2019-12-23 18:32:28 +02:00
Aliaksandr Valialkin
335bd0ac0a lib/encoding/zstd: prevent from possible encoder leak when concurrent goroutines create encoders for the same compressionLevel
Thanks to @klauspost for the pointer to this issue. See https://github.com/klauspost/compress/issues/195 for details.
2019-12-23 18:06:02 +02:00
Aliaksandr Valialkin
ba17fcbcc5 deployment/docker: update docker image tags from v1.31.1-cluster to v1.31.2-cluster 2019-12-20 13:06:49 +02:00
Aliaksandr Valialkin
9f50232e70 lib/logger: prevent from blocking when log output isn't consumed in timely manner
Drop log messages instead of blocking and increment `vm_log_messages_dropped_total` metric.
2019-12-20 11:49:42 +02:00
Aliaksandr Valialkin
cc8a1bae0e app/vmselect: add -search.maxExportDuration command-line flag for limiting /api/v1/export duration
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/275
2019-12-20 11:37:18 +02:00
Aliaksandr Valialkin
a37a006f11 lib/storage: scale ingestion performance by sharding rawRows on systems with more than 8 CPU cores 2019-12-19 18:17:05 +02:00
Aliaksandr Valialkin
8d79412b26 lib/storage: optimize bulk import performance when multiple data points are inserted for the same time series
This should speed up `/api/v1/import` and make it more scalable on multi-core systems.
2019-12-19 15:13:36 +02:00
Aliaksandr Valialkin
8b56b849e9 app/vminsert: return StatusNoContent http response for /api/v1/import to be consistent with other insert handlers 2019-12-19 01:22:01 +02:00
Aliaksandr Valialkin
05ec8afb3a lib/httpserver: sync the code with master branch 2019-12-18 23:08:32 +02:00
Aliaksandr Valialkin
a045c62532 docs/ExtendedPromQL.md: rewording regarding scalar vs instant vector difference 2019-12-18 21:47:31 +02:00
Aliaksandr Valialkin
cd04f6e82d docs/Home.md: fix link to case studies 2019-12-18 01:05:15 +02:00
Aliaksandr Valialkin
4e8583bb02 docs: renaming: PromQL extensions -> MetricsQL 2019-12-18 00:57:53 +02:00
Aliaksandr Valialkin
198debc1c6 deployment/docker: update docker image tags from v1.30.6-cluster to v1.31.1-cluster 2019-12-16 01:49:00 +02:00
Aliaksandr Valialkin
6a185b7809 app/vmselect: add ability to pass match[], start and end to /api/v1/labels
This makes the `/api/v1/labels` handler consistent with already existing functionality for `/api/v1/label/.../values`.

See https://github.com/prometheus/prometheus/issues/6178 for more details.
2019-12-15 00:20:43 +02:00
Aliaksandr Valialkin
a7bf8e77af app/vminsert: simultaneously accept telnet put and HTTP /api/put OpenTSDB metrics at -opentsdbListenAddr
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/266
2019-12-14 00:42:18 +02:00
Aliaksandr Valialkin
bc3984a5b3 lib/logger: add -loggerFormat for choosing log message formats
Supported formats: default, json

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/265
2019-12-13 15:09:18 +02:00
Aliaksandr Valialkin
aaf2545bdb make vendor-update 2019-12-12 19:38:56 +02:00
Aliaksandr Valialkin
b238997a84 all: rename Extended PromQL to PromQL extensions 2019-12-12 19:29:59 +02:00
Aliaksandr Valialkin
bf8cf77694 docs: sync with master branch 2019-12-12 14:54:42 +02:00
Aliaksandr Valialkin
fef2eefb5e docs: add Dreamteam numbers 2019-12-12 01:01:58 +02:00
Aliaksandr Valialkin
aad6ac76b9 docs/Single-server-VictoriaMetrics.md: sync with README.md 2019-12-12 00:56:04 +02:00
Aliaksandr Valialkin
cffaeda0f1 all: publish Docker images for the following GOARCH: amd64, arm, arm64, ppc64le and 386
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/258
2019-12-11 23:33:11 +02:00
Aliaksandr Valialkin
c25b97829f app/vmselect/promql: return lower and upper bounds for the estimated percentile from histogram_quantile if third arg is passed
Updates https://github.com/prometheus/prometheus/issues/5706
2019-12-11 14:00:18 +02:00
Aliaksandr Valialkin
557909aa81 docs: sync with master branch 2019-12-11 14:00:15 +02:00
Aliaksandr Valialkin
f79b61e2a1 app/vmselect/promql: return matrix instead of vector on subqueries to /api/v1/query like Prometheus does 2019-12-11 00:57:54 +02:00
Aliaksandr Valialkin
5d2ff573aa app/vmselect/promql: allow negative offsets
Updates https://github.com/prometheus/prometheus/issues/6282
2019-12-11 00:57:51 +02:00
Aliaksandr Valialkin
c444a929a6 deployment/docker: update Docker image tags from v1.30.5-cluster to v1.30.6-cluster 2019-12-10 00:13:59 +02:00
Aliaksandr Valialkin
7edfa4d0cc docs: use relative links 2019-12-09 23:05:39 +02:00
Aliaksandr Valialkin
e81a2bfdb3 docs: mention about /api/v1/import in Single-server-VictoriaMetrics.md 2019-12-09 23:05:38 +02:00
Aliaksandr Valialkin
033d252836 docs: mention about /api/v1/import in Cluster-VictoriaMetrics.md 2019-12-09 23:00:37 +02:00
Aliaksandr Valialkin
bd60dcb8ed deployment/docker: update Go builder from go1.13.4 to go1.13.5 2019-12-09 22:57:43 +02:00
Aliaksandr Valialkin
c81a89a8ed app/vminsert: add /api/v1/import handler
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6
2019-12-09 22:37:49 +02:00
Aliaksandr Valialkin
0c304439d4 app/vminsert: consistency renaming for counters 2019-12-09 16:44:26 +02:00
Aliaksandr Valialkin
3694efd005 lib/{mergeset,storage}: log info message when both source and destination part paths from txn are missing during startup
This is expected condition after unclean shutdown (OOM, hard reset, `kill -9`) on NFS disk.
2019-12-09 15:45:23 +02:00
Roman Khavronenko
924af22ced #251 - add Logging rate panel (#255) 2019-12-09 13:07:09 +02:00
Aliaksandr Valialkin
b809df03f8 vendor: fix broken build for GOARCH=arm64 on golang.org/x/sys/unix 2019-12-08 13:28:20 +02:00
Aliaksandr Valialkin
9442e619ea vendor: fix arm build for golang.org/x/sys/unix/zptrace_armnn_linux.go 2019-12-08 12:49:21 +02:00
Aliaksandr Valialkin
c217a53c35 make vendor-update 2019-12-07 23:11:26 +02:00
Aliaksandr Valialkin
3534e71c96 app/vminsert/influx: add a test case from https://community.librenms.org/t/integration-with-victoriametrics/9689 2019-12-07 23:00:54 +02:00
Aliaksandr Valialkin
8cf015c34f README.md: mention that VictoriaMetrics is built on shared nothing architecture 2019-12-05 20:38:20 +02:00
Aliaksandr Valialkin
7a775714ab deployment/docker: update Docker image tags from v1.30.4-cluster to v1.30.5-cluster 2019-12-05 20:15:50 +02:00
Aliaksandr Valialkin
e243429b39 docs: add docs as in the single-node branch 2019-12-05 19:28:29 +02:00
Aliaksandr Valialkin
d39bba3547 app/vmselect/promql: add {topk|bottomk}_{min|max|avg|median} aggregate functions for returning the exact k time series on the given time range
The full list of functions added:
- `topk_min(k, q)` - returns top K time series with the max minimums on the given time range
- `topk_max(k, q)` - returns top K time series with the max maximums on the given time range
- `topk_avg(k, q)` - returns top K time series with the max averages on the given time range
- `topk_median(k, q)` - returns top K time series with the max medians on the given time range
- `bottomk_min(k, q)` - returns bottom K time series with the min minimums on the given time range
- `bottomk_max(k, q)` - returns bottom K time series with the min maximums on the given time range
- `bottomk_avg(k, q)` - returns bottom K time series with the min averages on the given time range
- `bottomk_median(k, q)` - returns bottom K time series with the min medians on the given time range
2019-12-05 19:27:45 +02:00
Aliaksandr Valialkin
639967db59 lib/{mergeset,storage}: make sure pending transaction deletions are finished before and after runTransactions call.
`runTransactions` call issues async deletions for transaction files. The previously issued transaction deletions
can race with the next call to `runTransactions`. Prevent this by waiting until all the pending transaction
deletions are finished in the beginning of `runTransactions`. Also make sure that all the pending transaction
deletions are finished before returning from `runTransactions`.
2019-12-04 21:40:52 +02:00
Aliaksandr Valialkin
7c0dd85a7c lib/httpserver: add /ping handler for compatibility with Influx agents
Certain Influx agents check for `/ping` endpoint before starting
to send Influx line protocol data. See https://docs.influxdata.com/influxdb/v1.7/tools/api/#ping-http-endpoint
2019-12-04 19:18:18 +02:00
Aliaksandr Valialkin
877b83ce97 deployment/docker: update docker image tags from v1.30.3-cluster to v1.30.4-cluster 2019-12-04 01:53:04 +02:00
Aliaksandr Valialkin
e0f43e1f66 app/vmselect: add placeholders for /api/v1/rules and /api/v1/alerts 2019-12-03 19:38:09 +02:00
Aliaksandr Valialkin
534da0a8c3 lib/storage: fall back to global inverted index if a filter matches too many time series in per-day index
Previously this resulted in an error message. The query may succeed via search in global index.
2019-12-03 14:48:08 +02:00
Aliaksandr Valialkin
6eb698d1cc lib/storage: fix printing tag filters in TagFilters.String 2019-12-03 14:25:20 +02:00
Aliaksandr Valialkin
c04f60db35 lib/storage: print __name__ instead of empty string in user-visible tag filters 2019-12-03 14:18:18 +02:00
Aliaksandr Valialkin
625f6ca761 lib/storage: optimize regexp filter search 2019-12-03 00:33:53 +02:00
Aliaksandr Valialkin
47077c02ba deployment/docker: update image tags from v1.30.2-cluster to v1.30.3-cluster 2019-12-02 22:51:31 +02:00
Aliaksandr Valialkin
6bee9115aa vendor: update github.com/VictoriaMetrics/metrics from v1.9.1 to v1.9.2
This fixes possible deadlock when metrics.WritePrometheus calls Gauge callback, which calls metrics functions with internal lock.
2019-12-02 22:31:47 +02:00
Aliaksandr Valialkin
b9616c017f lib/{mergeset,storage}: remove transaction files only after the mentioned dirs are really removed
This should fix the issue on NFS when incompletely removed dirs may be left
after unclean shutdown (OOM, kill -9, hard reset, etc.), while the corresponding transaction
files are already removed.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/162
2019-12-02 21:34:37 +02:00
Aliaksandr Valialkin
4e22b521c2 lib/storage: remove metricID with missing metricID->metricName entry
The metricID->metricName entry can be missing in the indexdb after unclean shutdown
when only a part of entries for new time series is written into indexdb.

Recover from such a situation by removing the broken metricID. New metricID
will be automatically created for time series with the given metricName
when a new data point arrives for it.
2019-12-02 20:52:13 +02:00
Aliaksandr Valialkin
387f62f468 deployment/docker: update docker image tag from v1.30.1-cluster to v1.30.2-cluster 2019-12-02 15:17:41 +02:00
Aliaksandr Valialkin
5a62415bec lib/storage: protect from time drift during indexdb rotation
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/248
2019-12-02 14:43:11 +02:00
Aliaksandr Valialkin
cf85c567d1 lib/logger: merge file and line labels into location="file:line"
This should improve the usability for `vm_log_messages_total` metric during practical queries
2019-12-02 14:43:09 +02:00
Aliaksandr Valialkin
f055dbefda lib/storage: generate more human-friendly result in TagFilters.String 2019-12-02 13:56:40 +02:00
Aliaksandr Valialkin
819bb36852 app/vmselect/promql: estimate per-series scrape interval as 0.6 quantile for the first 100 intervals
This should improve scrape interval estimation for time series with gaps.
2019-12-02 13:43:04 +02:00
Aliaksandr Valialkin
29f39f866e lib/logger: consistency renaming from vm_log_messages_count to vm_log_messages_total, since this is a counter 2019-12-02 00:47:12 +02:00
Aliaksandr Valialkin
15eaff1745 lib/logger: track the number of log messages by (level, file, line) in the vm_log_messages_count metric 2019-12-01 18:38:30 +02:00
Aliaksandr Valialkin
d456ec7589 lib/netutil: use IPv6 for both listening and dialing if -enableTCP6 is set
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/244
2019-12-01 02:52:53 +02:00
Aliaksandr Valialkin
1595dcd3d9 app/vminsert/influx: allow empty measurement in Influx line protocol
In this case metric names are mapped directly from field names without any prefixes.
2019-11-30 21:59:07 +02:00
Aliaksandr Valialkin
1e2019b1b6 app/vmselect/promql: fix corner case for increase over time series with gaps
In this case `increase` could return invalid high value for the first point after the gap.
2019-11-30 01:34:18 +02:00
Aliaksandr Valialkin
4c63caa37c deployment/docker/certs: update TLS certs source from alpine:3.9 to alpine:3.10 2019-11-29 19:55:36 +02:00
Aliaksandr Valialkin
274d8bcb7b deployment/docker/docker-compose.yml: remove superfluous volume mount (#246)
The `provisioning/dashboards` folder should be already mounted on the previous line.

This should fix the `/bin/sh: can't create dashboards/vm.json: Permission denied` error on `docker-compose up`
2019-11-29 18:10:50 +02:00
Aliaksandr Valialkin
7e734433a3 lib/backup: cosmetic fixes after #243 2019-11-29 18:07:41 +02:00
glebsam
4a192cb832 Add option to provide custom endpoint for S3, add option to specify S3 config profile (#243)
* Add option to provide custom endpoint for S3 for use with s3-compatible storages, add option to specify S3 config profile

* make fmt
2019-11-29 18:07:39 +02:00
Aliaksandr Valialkin
4810f1dde6 lib/netutil: add -enableTCP6 command-line flag for enabling listening for IPv6 additionally to IPv4 TCP ports 2019-11-29 17:33:07 +02:00
Aliaksandr Valialkin
93dbec971b deployment/docker: update docker image tags from v1.30.0-cluster to v1.30.1-cluster 2019-11-28 22:26:25 +02:00
Aliaksandr Valialkin
90f2530f9f README.md: add monitoring section 2019-11-28 19:16:05 +02:00
Aliaksandr Valialkin
409c939621 lib/backup: remove flock.lock file in empty dirs
This fixes an issue when VictoriaMetrics doesn't see the restored data after the following operations:

1. Stop VictoriaMetrics.
2. Delete `<-storageDataPath>` dir.
3. Start VictoriaMetrics, then stop it.
4. Restore data from backup with `vmrestore`.
5. Start VictoriaMetrics.

`vmrestore` didn't delete properly empty dirs in `<-storageDataPath>/indexdb` because of the remaining `flock.lock` files in these dirs.
2019-11-28 13:39:28 +02:00
Aliaksandr Valialkin
572fe61857 README.md: remove the unnecessary step during restoring from backups 2019-11-27 19:56:15 +02:00
Aliaksandr Valialkin
396ed27759 vendor: make vendor-update 2019-11-27 15:34:18 +02:00
Aliaksandr Valialkin
2571903522 vendor: update github.com/VictoriaMetrics/fastcache from v1.5.2 to v1.5.4 2019-11-27 15:31:36 +02:00
Aliaksandr Valialkin
093f94d2db deployment/docker: update Grafana from v6.4.4 to v6.5.0 2019-11-27 15:10:01 +02:00
Aliaksandr Valialkin
8ccbcaf99f deployment/docker: update image tags from v1.29.5-cluster to v1.30.0-cluster 2019-11-27 14:54:21 +02:00
Aliaksandr Valialkin
def9ccd360 app/vmselect/prometheus: consistently apply nocache arg to /api/v1/query the same way as to /api/v1/query_range
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/241
2019-11-26 22:55:50 +02:00
Aliaksandr Valialkin
e0ac068112 app/vmselect/prometheus: fix content-type for /api/v1/export responses
The correct Content-Type should be `application/stream+json` instead of `application/json`
Thanks to Joshua Ryder for pointing to this.
2019-11-26 17:44:27 +02:00
Aliaksandr Valialkin
28cc4c09b5 app/vmselect/promql: remove zero timeseries from prometheus_buckets output 2019-11-25 19:10:13 +02:00
Aliaksandr Valialkin
8811bec14e app/vmselect/prometheus: reduce default value for -search.latencyOffset from 60s to 30s
30 seconds should be enough for almost all the cases
2019-11-25 16:33:36 +02:00
Aliaksandr Valialkin
f7da9b2db2 app/vmselect/promql: allow nested parens 2019-11-25 16:13:33 +02:00
Aliaksandr Valialkin
d2619d6dce vendor: update github.com/VictoriaMetrics/metrics from v1.9.0 to v1.9.1 2019-11-25 15:22:50 +02:00
Aliaksandr Valialkin
f46fb6c740 app/vmselect/promql: re-use metrics.Histogram when calculating histogram function for each point on the graph
This should reduce the amount of memory allocations
2019-11-25 14:24:30 +02:00
Aliaksandr Valialkin
0f184affa7 app/vmselect/promql: optimize binary search over big number of samples during rollup calculations 2019-11-25 14:01:54 +02:00
Aliaksandr Valialkin
dbd07041ae app/vmselect/promql: adjust tests after the upgrade of github.com/VictoriaMetrics/metrics from v1.8.3 to v1.9.0 2019-11-25 13:44:08 +02:00
Aliaksandr Valialkin
406e36f817 vendor: update github.com/VictoriaMetrics/metrics from v1.8.3 to v1.9.0 2019-11-25 13:19:34 +02:00
Aliaksandr Valialkin
8bb254d960 app/vmselect/promql: add histogram aggregate function, which is useful for building heatmaps from multiple time series 2019-11-24 00:04:15 +02:00
Aliaksandr Valialkin
e70f543321 vendor: update github.com/VictoriaMetrics/metrics from v1.8.2 to v1.8.3 2019-11-24 00:04:14 +02:00
Aliaksandr Valialkin
d24fc87a6f lib/decimal: calculate ln2/ln10 constant during compile time 2019-11-23 15:52:39 +02:00
Aliaksandr Valialkin
414259f47b app/vmselect/promql: do not take into account buckets with negative counters in prometheus_buckets 2019-11-23 14:19:19 +02:00
Aliaksandr Valialkin
193d553f6d app/vmselect/promql: properly handle histogram_quantile(0, ...) with zero buckets 2019-11-23 14:02:25 +02:00
Aliaksandr Valialkin
f8298c7f13 app/vmselect: add vm_per_query_{rows,series}_processed_count histograms 2019-11-23 13:23:03 +02:00
Aliaksandr Valialkin
b1c3284fd0 dashboards: remove deprecated dashboards - now only victoriametrics.json is officially supported 2019-11-23 12:43:38 +02:00
Aliaksandr Valialkin
654473f6c6 vendor: update github.com/VictoriaMetrics/metrics from v1.8.1 to v1.8.2 2019-11-23 11:49:18 +02:00
Aliaksandr Valialkin
4d76977745 app/vmselect/promql: transparently apply prometheus_buckets in histogram_quantile 2019-11-23 11:49:16 +02:00
Aliaksandr Valialkin
cfeb606e73 vendor: update github.com/VictoriaMetrics/metrics from v1.8.0 to v1.8.1 2019-11-23 00:48:55 +02:00
Aliaksandr Valialkin
2af7ca1122 vendor: update github.com/VictoriaMetrics/metrics from v1.7.2 to v1.8.0. This version supports histograms 2019-11-23 00:21:57 +02:00
Aliaksandr Valialkin
5f6f03c692 app/vmselect/promql: add prometheus_buckets function for converting the upcoming histogram buckets from github.com/VictoriaMetrics/metrics to Prometheus-compatible buckets 2019-11-23 00:21:56 +02:00
Aliaksandr Valialkin
17d08c1fe0 app/vmselect: adjust end arg instead of adjusting start arg if start > end
`start` arg has higher chances to be set properly comparing to `end` arg,
so it is expected that the `end` arg could be adjusted if it was set incorrectly.
2019-11-22 16:12:53 +02:00
Aliaksandr Valialkin
14ba958e9a deployment/docker: update docker image tag from v1.29.3-cluster to v1.29.5-cluster 2019-11-22 14:05:17 +02:00
Aliaksandr Valialkin
7c48f8611f vendor: updated github.com/valyala/gozstd from v1.6.2 to v1.6.3 2019-11-21 23:56:27 +02:00
Aliaksandr Valialkin
b9e53490b9 lib/storage: move non-matching tag filters to the top at matchTagFilters
This should reduce the amount of useless work needed for matching the next metricNames.
2019-11-21 21:40:36 +02:00
Aliaksandr Valialkin
33d9d63393 lib/storage: speed up time series search for queries with multiple filters
Use optimized specialized binary search for uint64 metricIDs instead of generic sort.Search.
2019-11-21 18:43:40 +02:00
Aliaksandr Valialkin
926290d73e Makefile: create files with sha256 checksums during make release
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/19
2019-11-20 22:45:23 +02:00
Aliaksandr Valialkin
a02a57fbe9 lib/storage: verify the number of returned metricIDs in BenchmarkHeadPostingForMatchers 2019-11-20 15:40:03 +02:00
Aliaksandr Valialkin
3d1f4408cf lib/decimal: increase decimal->float speed conversion for integer numbers 2019-11-20 14:09:10 +02:00
Aliaksandr Valialkin
f1f2eff08f lib/decimal: reduce rounding error when converting from decimal to float with negative exponent
While at it, slightly increase the conversion performance by moving fast path to the top of the loop.
2019-11-19 23:34:41 +02:00
Aliaksandr Valialkin
2929a41e3b make vendor-update 2019-11-19 21:34:44 +02:00
Aliaksandr Valialkin
17eca31989 lib/backup: retrieve only the required metadata when reading GCS objects 2019-11-19 21:30:51 +02:00
Aliaksandr Valialkin
ccf3d143c5 make vendor-update 2019-11-19 21:30:49 +02:00
Aliaksandr Valialkin
216a260ced app/{vmbackup,vmrestore}: add -maxBytesPerSecond command-line flag for limiting the used network bandwidth during backup / restore 2019-11-19 20:32:43 +02:00
Aliaksandr Valialkin
9d1ee1e2ae lib/backup: prevent from restoring to directory which is in use by VictoriaMetrics during the restore 2019-11-19 18:35:59 +02:00
Aliaksandr Valialkin
5ae47e8940 app/vmselect/prometheus: properly adjust too big time on /api/v1/query
Too big `time` must be adjusted to `now()-queryOffset`.
2019-11-19 00:42:07 +02:00
Aliaksandr Valialkin
6ca4b94511 lib/storage: increase the number of created time series in BenchmarkHeadPostingForMatchers in order to be on par with Prometheus
The previous commit was accidentally creating 10x smaller number of time series than Prometheus
and this led to invalid benchmark results.

The updated benchmark results:

benchmark                                                          old ns/op      new ns/op     delta
BenchmarkHeadPostingForMatchers/n="1"                              272756688      6194893       -97.73%
BenchmarkHeadPostingForMatchers/n="1",j="foo"                      138132923      10781372      -92.19%
BenchmarkHeadPostingForMatchers/j="foo",n="1"                      134723762      10632834      -92.11%
BenchmarkHeadPostingForMatchers/n="1",j!="foo"                     195823953      10679975      -94.55%
BenchmarkHeadPostingForMatchers/i=~".*"                            7962582919     100118510     -98.74%
BenchmarkHeadPostingForMatchers/i=~".+"                            7589543864     154955671     -97.96%
BenchmarkHeadPostingForMatchers/i=~""                              1142371741     258003769     -77.42%
BenchmarkHeadPostingForMatchers/i!=""                              9964150263     159783895     -98.40%
BenchmarkHeadPostingForMatchers/n="1",i=~".*",j="foo"              216995884      10937895      -94.96%
BenchmarkHeadPostingForMatchers/n="1",i=~".*",i!="2",j="foo"       202541348      10990027      -94.57%
BenchmarkHeadPostingForMatchers/n="1",i!=""                        486285711      87004349      -82.11%
BenchmarkHeadPostingForMatchers/n="1",i!="",j="foo"                350776931      53342793      -84.79%
BenchmarkHeadPostingForMatchers/n="1",i=~".+",j="foo"              380888565      54256156      -85.76%
BenchmarkHeadPostingForMatchers/n="1",i=~"1.+",j="foo"             89500296       21823279      -75.62%
BenchmarkHeadPostingForMatchers/n="1",i=~".+",i!="2",j="foo"       379529654      46671359      -87.70%
BenchmarkHeadPostingForMatchers/n="1",i=~".+",i!~"2.*",j="foo"     424563825      53915842      -87.30%

VictoriaMetrics uses 1GB of RAM during the benchmark (vs 3.5GB of RAM for Prometheus)
2019-11-18 19:48:27 +02:00
Aliaksandr Valialkin
6f61fd367a lib/storage: add BenchmarkHeadPostingForMatchers similar to the benchmark from Prometheus
See the corresponding benchmark in Prometheus - 23c0299d85/tsdb/head_bench_test.go (L52)

The benchmark allows performing apples-to-apples comparison of time series search
in Prometheus and VictoriaMetrics. The following article - https://www.robustperception.io/evaluating-performance-and-correctness -
contains incorrect numbers for VictoriaMetrics, since there wasn't this benchmark yet. Fix it.

Benchmarks can be repeated with the following commands from Prometheus and VictoriaMetrics source code roots:

- Prometheus: GOMAXPROCS=1 go test ./tsdb/ -run=111 -bench=BenchmarkHeadPostingForMatchers
- VictoriaMetrics: GOMAXPROCS=1 go test ./lib/storage/ -run=111 -bench=BenchmarkHeadPostingForMatchers

Benchmark results:
benchmark                                                          old ns/op      new ns/op     delta
BenchmarkHeadPostingForMatchers/n="1"                              272756688      364977        -99.87%
BenchmarkHeadPostingForMatchers/n="1",j="foo"                      138132923      1181636       -99.14%
BenchmarkHeadPostingForMatchers/j="foo",n="1"                      134723762      1141578       -99.15%
BenchmarkHeadPostingForMatchers/n="1",j!="foo"                     195823953      1148056       -99.41%
BenchmarkHeadPostingForMatchers/i=~".*"                            7962582919     8716755       -99.89%
BenchmarkHeadPostingForMatchers/i=~".+"                            7589543864     12096587      -99.84%
BenchmarkHeadPostingForMatchers/i=~""                              1142371741     16164560      -98.59%
BenchmarkHeadPostingForMatchers/i!=""                              9964150263     12230021      -99.88%
BenchmarkHeadPostingForMatchers/n="1",i=~".*",j="foo"              216995884      1173476       -99.46%
BenchmarkHeadPostingForMatchers/n="1",i=~".*",i!="2",j="foo"       202541348      1299743       -99.36%
BenchmarkHeadPostingForMatchers/n="1",i!=""                        486285711      11555193      -97.62%
BenchmarkHeadPostingForMatchers/n="1",i!="",j="foo"                350776931      5607506       -98.40%
BenchmarkHeadPostingForMatchers/n="1",i=~".+",j="foo"              380888565      6380335       -98.32%
BenchmarkHeadPostingForMatchers/n="1",i=~"1.+",j="foo"             89500296       2078970       -97.68%
BenchmarkHeadPostingForMatchers/n="1",i=~".+",i!="2",j="foo"       379529654      6561368       -98.27%
BenchmarkHeadPostingForMatchers/n="1",i=~".+",i!~"2.*",j="foo"     424563825      6757132       -98.41%

The first column (old) is for Prometheus, the second column (new) is for VictoriaMetrics.

Prometheus was using 3.5GB of RAM during the benchmark, while VictoriaMetrics was using 400MB of RAM.
2019-11-18 18:47:02 +02:00
Aliaksandr Valialkin
77bb66a5be app/vmselect/promql: properly calculate integrate(q[d]) 2019-11-13 21:11:03 +02:00
Aliaksandr Valialkin
c33640664a app/vmselect/promql: use universal approach for determining maxByteSliceLen on 32-bit and 64-bit archs
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/235
2019-11-13 20:26:07 +02:00
Aliaksandr Valialkin
d297b65089 lib/storage: add vm_cache_size_bytes{type="storage/hour_metric_ids"} metric 2019-11-13 20:26:05 +02:00
Aliaksandr Valialkin
31376fd353 deployment/docker: update docker image tag from v1.29.2-cluster to v1.29.3-cluster 2019-11-13 18:32:08 +02:00
Aliaksandr Valialkin
494ad0fdb3 lib/storage: remove inmemory index for recent hour, since it uses too much memory
Production workload shows that the index requires ~4Kb of RAM per active time series.
This is too much for high number of active time series, so let's delete this index.

Now the queries should fall back to the index for the current day instead of the index
for the recent hour. The query performance for the current day index should be good enough
given the 100M rows/sec scan speed per CPU core.
2019-11-13 18:08:58 +02:00
Aliaksandr Valialkin
90bde025f0 deployment/docker: update image tag from v1.29.0-cluster to v1.29.2-cluster 2019-11-13 15:24:44 +02:00
Aliaksandr Valialkin
633dd81bb5 lib/storage: add -disableRecentHourIndex flag for disabling inmemory index for recent hour
This may be useful for saving RAM on high number of time series aka high cardinality
2019-11-13 15:10:12 +02:00
Aliaksandr Valialkin
f1620ba7c0 lib/storage: fix inmemory inverted index issues found in v1.29
Issues fixed:
- Slow startup times. Now the index is loaded from cache during start.
- High memory usage related to superfluous index copies every 10 seconds.
2019-11-13 13:35:38 +02:00
Aliaksandr Valialkin
87b39222be Revert "lib/fs: do not postpone directory removal on NFS error"
This reverts commit 21aeb02b46649ac9906cb37733f7b155a77a0db9.
2019-11-12 16:29:50 +02:00
Mike Poindexter
955a592106 Add test for invalid caching of tsids (#232)
* Add test for invalid caching of tsids

* Clean up error handling
2019-11-12 15:52:46 +02:00
Roman Khavronenko
ce8cc76a42 add links and fix cache metric name (#233) 2019-11-12 15:06:56 +02:00
Aliaksandr Valialkin
6afb7a50a9 deployment/docker: upgrade Grafana release from v6.4.3 to v6.4.4 2019-11-12 03:50:54 +02:00
Aliaksandr Valialkin
5b677a57e3 deployment/docker: upgrade Go from v1.13.3 to v1.13.4 2019-11-12 03:49:07 +02:00
Aliaksandr Valialkin
d420871d79 deployment/docker: upgrade docker image tag from v1.28.3-cluster to v1.29.0-cluster 2019-11-12 03:44:45 +02:00
Aliaksandr Valialkin
584d8362c8 deployment: update Prometheus from v2.13.0 to v2.14.0 2019-11-12 03:43:59 +02:00
Roman Khavronenko
828f0a2a4b prepare dashboard for external sharing (#231) 2019-11-12 00:23:24 +02:00
Oleg Kovalov
74ba42d111 fix misspelled words (#229) 2019-11-12 00:18:24 +02:00
Aliaksandr Valialkin
c48e39eea9 lib/storage: add tests for dateMetricIDCache 2019-11-11 13:21:05 +02:00
Aliaksandr Valialkin
bdc9045485 README.md: mention that replication doesn't save from disaster 2019-11-11 00:58:08 +02:00
Aliaksandr Valialkin
01801e9e03 dashboards: there will be no 1.28.4 release. It will be 1.29.0 2019-11-10 22:05:10 +02:00
Aliaksandr Valialkin
6bdde0d6d4 lib/storage: eliminate data race when updating lastSyncTime in dateMetricIDCache.Has 2019-11-10 22:04:23 +02:00
Roman Khavronenko
7247a7862d add description, churn rate panel, storage.ingestion rate panel (#228) 2019-11-10 20:32:10 +02:00
Aliaksandr Valialkin
5f52eb7653 lib/fs: do not postpone directory removal on NFS error
Continue trying to remove NFS directory on temporary errors for up to a minute.

The previous async removal process breaks in the following case during VictoriaMetrics start

- VictoriaMetrics opens index, finds incomplete merge transactions and starts replaying them.
- The transaction instructs removing old directories for parts, which were already merged into bigger part.
- VictoriaMetrics removes these directories, but their removal is delayed due to NFS errors.
- VictoriaMetrics scans partition directory after all the incomplete merge transactions are finished
  and finds directories, which should be removed, but weren't still removed due to NFS errors.
- VictoriaMetrics panics when it finds unexpected empty directory.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/162
2019-11-10 13:27:16 +02:00
Aliaksandr Valialkin
9ea2bd822e lib/storage: implement per-day inverted index 2019-11-10 00:20:32 +02:00
Aliaksandr Valialkin
5d8de72414 app/vmrestore: the upcoming release would be 1.29.0 2019-11-10 00:20:18 +02:00
Aliaksandr Valialkin
dea2f3efed lib/storage: use specialized cache for (date, metricID) entries
This improves ingestion performance.
2019-11-09 23:09:18 +02:00
Aliaksandr Valialkin
9a43902bd8 lib/storage: remove unused code from getMetricIDsForTimeRange: it is expected that time range is always non-zero 2019-11-09 19:03:51 +02:00
Aliaksandr Valialkin
c16e17dede lib/storage: properly set time range when deleting time series 2019-11-09 18:50:02 +02:00
Aliaksandr Valialkin
8126007c15 lib/storage: obtain all the time series ids from (tag->metricIDs) rows instead of (metricID->TSID) rows, since this is much faster 2019-11-09 18:04:26 +02:00
Aliaksandr Valialkin
50773348d3 lib/storage: small code prettifying 2019-11-09 14:01:24 +02:00
Aliaksandr Valialkin
44fa8226df lib/uint64set: remove superfluous check for item existence before deleting it in Set.Subtract 2019-11-09 14:01:24 +02:00
Aliaksandr Valialkin
0bc54c23ce lib/storage: inmemoryInvertedIndex prettifying 2019-11-09 14:01:24 +02:00
Aliaksandr Valialkin
46e67bb78c lib/storage: export vm_new_timeseries_created_total metric for determining time series churn rate 2019-11-08 19:58:21 +02:00
Aliaksandr Valialkin
0063c857f5 lib/storage: add inmemory inverted index for the last hour
It should improve performance for `last N hours` dashboards with update intervals smaller than 1 hour.
2019-11-08 19:37:46 +02:00
Aliaksandr Valialkin
33abbec6b4 app/vmselect/promql: adjust memory limits calculations for incremental aggregate functions
Incremental aggregate functions don't keep all the selected time series in memory -
they keep only up to GOMAXPROCS time series for incremental aggregations.

Take into account that the number of time series in RAM can be higher if they are split
into many groups with `by (...)` or `without (...)` modifiers.

This should reduce the number of `not enough memory for processing ... data points` false
positive errors.
2019-11-08 19:37:43 +02:00
Aliaksandr Valialkin
7d7fbf890e app/{vmbackup,vmrestore}: add vmbackup and vmrestore tools for creating backups on s3 or gcs from instant snapshots
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/203
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/38
2019-11-07 21:26:43 +02:00
Roman Khavronenko
4e7a2a41a4 Cluster dashboard (#222)
* add dashboard for cluster version

* fix queries and panels

* review fixes

* use resident memory for memory usage panel

* fix job selectors
2019-11-07 12:09:27 +02:00
Aliaksandr Valialkin
89c03a5464 lib/storage: populate partition names from both small and big directories
Certain partition directories may be missing after restoring from backups
if they had no data. Re-create such directories on start.
2019-11-06 19:50:21 +02:00
Aliaksandr Valialkin
1c777e0245 lib/storage: substitute error message about unsorted items in the index block after metricIDs merge with counter
The origin of the error has been detected and documented in the code,
so it is enough to export a counter for such errors at `vm_index_blocks_with_metric_ids_incorrect_order_total`,
so it could be monitored and alerted on high error rates.

Export also the counter for processed index blocks with metricIDs - `vm_index_blocks_with_metric_ids_processed_total`,
so its rate could be compared to `rate(vm_index_blocks_with_metric_ids_incorrect_order_total)`.
2019-11-06 14:32:41 +02:00
Aliaksandr Valialkin
c567a4353a lib/storage: take into account the requested time range when caching TSIDs for the given tag filters 2019-11-06 14:32:41 +02:00
Aliaksandr Valialkin
c6564c5d26 lib/storage: dump incorrectly sorted items on a single line; this should simplify error reporting 2019-11-05 18:41:50 +02:00
Aliaksandr Valialkin
2ef5082ead deployment/docker: update docker images from v1.28.2-cluster to v1.28.3-cluster 2019-11-05 18:08:50 +02:00
Aliaksandr Valialkin
a10c4cad85 lib/storage: return back finalPartsToMerge from 2 to 3 in order to prevent from excessive merges in old partitions 2019-11-05 17:28:57 +02:00
Aliaksandr Valialkin
e5b1fa0c38 lib/storage: separate the max inverted index scan loops per metric into fast and slow loops
Slow loops could require seeks and expensive regexp matching, while fast loops just scan
all the metricIDs for the given `tag=value` prefix. So these operations must have separate
max loops multiplier.
2019-11-05 17:28:57 +02:00
Aliaksandr Valialkin
f93c4f2493 lib/storage: skip repeated useless work when intersection of metricIDs with the given filter is too expensive
This should improve performance for query filters over big number of time series.
2019-11-05 14:35:55 +02:00
Aliaksandr Valialkin
f48e97263c lib/storage: reduce the maximum inverted index scans before giving up to label filters matching by metric name
The new value reduces the amount of wasted work during index scans over big number of time series.
2019-11-05 14:35:53 +02:00
Aliaksandr Valialkin
d2f688c550 lib/storage: try potentially faster tag filters at first, then apply slower tag filters
The fastest tag filters are non-negative non-regexp, since they are the most specific.
The slowest tag filters are negative regexp, since they require scanning
all the entries for the given label.
2019-11-05 14:35:48 +02:00
Aliaksandr Valialkin
a72b22a8b1 Makefile: add pprof-cpu rule for inspecting CPU profiles with PPROF_FILE=/path/to/cpu.pprof make pprof-cpu 2019-11-04 12:43:57 +02:00
Aliaksandr Valialkin
2a38d30f93 lib/storage: pass pointer to MetricName in Fatalf, so it is properly detected as an interface with String() method
This fixes lint errors
2019-11-04 01:06:45 +02:00
Artem Navoiev
e05500cbd4 add unittests for bytesutil and storage (#221) 2019-11-04 00:57:24 +02:00
Aliaksandr Valialkin
f5fbc3ffd7 lib/{storage,uint64set}: add Set.Union() function and use it 2019-11-04 00:48:32 +02:00
Aliaksandr Valialkin
23e078261e lib/storage: tune the returned value from adjustMaxMetricsAdaptive 2019-11-04 00:45:28 +02:00
Aliaksandr Valialkin
386c349c8c lib/storage: remove interface conversion in hot path during block merging
This should improve merge speed a bit for parts with big number of small blocks.
2019-11-03 12:33:48 +02:00
Aliaksandr Valialkin
26ffc77622 lib/{storage,mergeset}: create missing partition directories after restoring from backups
Backup tools could skip empty directories. So re-create such directories on the first run.
2019-11-02 02:27:19 +02:00
Aliaksandr Valialkin
5d439cc6f2 lib/{decimal,encoding}: optimize float64<->decimal conversion for arrays with zeros or ones
Time series with only zeros or ones frequently occur in monitoring, so it is worth optimizing their handling.
2019-11-01 16:46:08 +02:00
Aliaksandr Valialkin
1037053fed lib/{encoding,decimal}: add benchmarks for blocks containing zeros or ones
Time series with such values are quite common in monitoring space,
so it would be great to have benchmarks for them.
2019-11-01 16:46:08 +02:00
Aliaksandr Valialkin
46b8e13d8c deployment/docker: update image tag from v1.28.1-cluster to v1.28.2-cluster 2019-11-01 16:33:46 +02:00
Aliaksandr Valialkin
44fab198e2 lib/uint64set: return an empty set instead of nil set from Set.Clone, since the caller may add data to the cloned set
This fixes the following panic in v1.28.1:

panic: runtime error: invalid memory address or nil pointer dereference
[signal SIGSEGV: segmentation violation code=0x1 addr=0x10 pc=0x783a7e]

goroutine 1155 [running]:
github.com/VictoriaMetrics/VictoriaMetrics/lib/uint64set.(*Set).Add(0x0, 0x15b3bfb41e8b71ec)
  github.com/VictoriaMetrics/VictoriaMetrics@/lib/uint64set/uint64set.go:57 +0x2e
github.com/VictoriaMetrics/VictoriaMetrics/lib/storage.(*indexSearch).getMetricIDsForRecentHours(0xc5bdc0dd40, 0x16e273f6b50, 0x16e2745d3f0, 0x5b8d95, 0x10, 0x4a2f51, 0xaa01000000000000)
  github.com/VictoriaMetrics/VictoriaMetrics@/lib/storage/index_db.go:1951 +0x260
github.com/VictoriaMetrics/VictoriaMetrics/lib/storage.(*indexSearch).getMetricIDsForTimeRange(0xc5bdc0dd40, 0x16e273f6b50, 0x16e2745d3f0, 0x5b8d95, 0x10, 0xb296c0, 0xc00009cd80, 0x9bc640)
2019-11-01 16:12:21 +02:00
Aliaksandr Valialkin
4a8251feff app/vmselect/promql: add lag(q[d]) function, which returns the lag between the current timestamp and the timestamp for the last data point in q 2019-11-01 12:21:43 +02:00
Aliaksandr Valialkin
bd065aad5e deployment/docker: update docker images from v1.28.0-cluster to v1.28.1-cluster 2019-10-31 17:03:27 +02:00
Aliaksandr Valialkin
6ab9c98a1e app/vmstorage: add -bigMergeConcurrency and -smallMergeConcurrency flags for tuning the maximum number of CPU cores used during merges 2019-10-31 16:17:29 +02:00
Aliaksandr Valialkin
6a22727676 lib/storage: optimize getMetricIDsForRecentHours for per-tenant lookups 2019-10-31 15:51:09 +02:00
Aliaksandr Valialkin
ca480915ca lib/storage: small cleanup in Storage.add 2019-10-31 14:30:22 +02:00
Aliaksandr Valialkin
22030b558f lib/decimal: speed up FromFloat for common case with integers 2019-10-31 13:25:09 +02:00
Aliaksandr Valialkin
6510258a80 lib/decimal: increase float64->decimal conversion precision a bit
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/213
2019-10-30 02:04:28 +02:00
Aliaksandr Valialkin
a27e034a40 lib/storage: get parts to merge after applying the limit on the number of concurrent merges
This should reduce write amplification under high ingestion rate.
2019-10-30 00:09:44 +02:00
Aliaksandr Valialkin
5d2276dbf7 lib/{mergeset,storage}: limit the maximum number of concurrent merges; leave smaller number of parts during final merge 2019-10-29 12:45:37 +02:00
Aliaksandr Valialkin
78166cc478 vendor: update github.com/VictoriaMetrics/fastcache from v1.5.1 to v1.5.2 2019-10-29 11:31:36 +02:00
Aliaksandr Valialkin
f581b2736a lib/fs: typo fix in comment to WriteFileAtomically 2019-10-29 11:31:34 +02:00
Aliaksandr Valialkin
a638c6d4f8 vendor: make vendor-update 2019-10-28 13:41:13 +02:00
Roman Khavronenko
1750ee1575 * #215: update klauspost/compress lib
* #215: bump klauspost/compress lib to 1.9.1
2019-10-28 13:41:10 +02:00
Aliaksandr Valialkin
eb513e7ba3 lib/decimal: increase float->decimal conversion precision for big numbers
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/213
2019-10-28 13:23:54 +02:00
Aliaksandr Valialkin
4e6bf6f538 app/vmselect: add -search.latencyOffset flag for tuning the time after data collection when data points become visible in query results
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/218
2019-10-28 12:32:36 +02:00
Aliaksandr Valialkin
121be98325 deployment/docker: upgrade Go builder from go1.13.1 to go1.13.3 2019-10-20 23:49:18 +03:00
hanzai
52778da1f3 warns during rows addition (#214) 2019-10-20 23:38:51 +03:00
Aliaksandr Valialkin
6823aaaf08 README.md: add capacity planning chapter 2019-10-19 10:48:00 +03:00
Aliaksandr Valialkin
78fc35c9b1 all: make fmt 2019-10-17 20:05:12 +03:00
Aliaksandr Valialkin
88d793305d Makefile: disable structcheck in golangci-lint, since it gives false positive on embedded structs 2019-10-17 20:00:17 +03:00
Aliaksandr Valialkin
5b01b7fb01 all: add support for GOARCH=386 and fix all the issues related to 32-bit architectures such as GOARCH=arm
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/212
2019-10-17 18:27:49 +03:00
Aliaksandr Valialkin
5d2af2cfa2 vendor: update github.com/valyala/quicktemplate from v1.2.0 to v1.3.1 2019-10-17 18:27:49 +03:00
Aliaksandr Valialkin
12c8afc3f2 lib/memory: properly handle int overflow in sysTotalMemory
This should fix builds on 32-bit architectures such as arm.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/212
2019-10-17 18:27:49 +03:00
Aliaksandr Valialkin
7d7d7a7d4e deployment/docker/docker-compose.yml: update Prometheus from v2.12.0 to v2.13.0 2019-10-16 12:39:17 +03:00
Aliaksandr Valialkin
e0109fc316 deployment/docker/docker-compose.yml: update VictoriaMetrics image from v1.27.0-cluster to v1.28.0-cluster 2019-10-16 12:39:17 +03:00
Aliaksandr Valialkin
469d169a5d README.md: mention our Slack 2019-10-16 12:31:53 +03:00
Aliaksandr Valialkin
99786c2864 app/vmselect/prometheus: add -search.maxLookback command-line flag for overriding dynamic calculations for max lookback interval
This flag is similar to `-search.lookback-delta` if set. The max lookback interval is determined dynamically
from interval between datapoints for each input time series if the flag isn't set.

The interval can be overridden on per-query basis by passing `max_lookback=<duration>` query arg to `/api/v1/query` and `/api/v1/query_range`.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/209
2019-10-15 21:37:17 +03:00
Aliaksandr Valialkin
ce266d157d README.md: mention contact email for consulting and support 2019-10-15 00:11:39 +03:00
Aliaksandr Valialkin
dc2f822577 lib/prompb: removed outdated README.md 2019-10-14 22:16:36 +03:00
Aliaksandr Valialkin
8ecdb04b7c Makefile: remove obsolete Makefile include from /helm/ directory 2019-10-13 23:22:54 +03:00
Aliaksandr Valialkin
92e0ca6bbf vendor: make vendor-update 2019-10-13 23:18:28 +03:00
Artem Navoiev
75504747c8 Update README.md change helm section 2019-10-13 23:05:37 +03:00
Artem Navoiev
3d3d87f718 [deployment] remove helm chart 2019-10-13 23:03:52 +03:00
Aliaksandr Valialkin
bf6fe234b2 README.md: mention about delete_series handler
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/207
2019-10-10 02:09:38 +03:00
Aliaksandr Valialkin
f1a7965676 README.md: refer to comment about ingestion rate scalability
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/175
2019-10-09 17:28:25 +03:00
Aliaksandr Valialkin
7b6570489a README.md: add a few words about scalability 2019-10-09 13:01:59 +03:00
Aliaksandr Valialkin
661b8ede5b lib/storage: harden the check that the original items are sorted after mergeTagToMetricIDsRows fails to preserve sort order 2019-10-09 12:13:43 +03:00
Aliaksandr Valialkin
7f4a04ee6a lib/storage: typo fix in comment to maxRowsPerSmallPart. 2019-10-08 18:51:56 +03:00
Aliaksandr Valialkin
7e410e1412 lib/storage: add tests for mergeTagToMetricIDsRows and return the original items if the function breaks items' ordering.
This should save from data corruption issues revealed in the previous releases up to v1.28.0-beta5.
2019-10-08 16:35:39 +03:00
Aliaksandr Valialkin
a5302a6651 app/vmselect/promql: take into account the previous point when calculating max_over_time and min_over_time
This lines up with `first_over_time` function used in `rollup_candlestick`, so `rollup=low` always returns
the minimum value.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/204
2019-10-08 12:30:16 +03:00
Aliaksandr Valialkin
95d0f1bfd1 vendor: make vendor-update 2019-10-06 15:48:45 +03:00
Aliaksandr Valialkin
84b3b29644 README.md: add binaries section with urls to pre-compiled binaries and docker images 2019-10-06 11:42:36 +03:00
Aliaksandr Valialkin
39b18b1dcd vendor: update github.com/VictoriaMetrics/metrics from v1.7.1 to v1.7.2 2019-10-06 11:20:03 +03:00
Stian Ovrevage
ef6e01b1fa Add bool to extraLabels. Fix tls indentation 2019-10-03 21:47:00 +03:00
Stian Ovrevage
4fb63d7d61 Fix helm template indentation 2019-10-03 21:47:00 +03:00
Aliaksandr Valialkin
9fce611fbb lib/mergeset: reduce the maximum number of cached blocks, since there are reports on OOMs due to too big caches
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/189
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/195
2019-09-30 12:27:30 +03:00
Aliaksandr Valialkin
483af3a97a app/vmselect/netstorage: hint the OS that tmpBlocksFile is read almost sequentially
This became the case after b7ee2e7af2 .
2019-09-30 00:13:33 +03:00
Aliaksandr Valialkin
946ca438a6 app/vmselect/netstorage: marshal block outside tmpBlocksFile.WriteBlock
This also allows marshaling outside lock, thus reducing the amount of work under the lock.
2019-09-28 20:57:20 +03:00
Aliaksandr Valialkin
e92e39eddf app/vmselect/netstorage: reduce the number of disk seeks when the query processes big number of time series 2019-09-28 20:57:20 +03:00
Aliaksandr Valialkin
56dff57f77 app/vmselect/netstorage: reduce memory usage when fetching big number of data blocks from vmstorage
Dump data blocks directly to temporary file instead of buffering them in RAM
2019-09-28 12:21:57 +03:00
Aliaksandr Valialkin
ba460f62e6 app/vmselect/promql: do not generate timestamps for NaN values in timestamp function according to Prometheus logic 2019-09-27 18:55:16 +03:00
Stian Øvrevåge
a9dac3829e Update README.md - Fix helm command typos
`victoria-mertrics` -> `victoria-metrics` in helm commands.
2019-09-27 12:33:48 +03:00
Aliaksandr Valialkin
de919574a5 deployment/docker: switch Go builder image from v1.13.0 to v1.13.1 2019-09-26 17:12:52 +03:00
Aliaksandr Valialkin
d0b4590099 lib/storage: optimize TSID comparison 2019-09-26 14:20:02 +03:00
Aliaksandr Valialkin
95e3d648cb lib/storage: verify whether items are sorted in the end of call to mergeTagToMetricIDsRows
This should prevent from inverted index corruption if bug in mergeTagToMetricIDsRows is discovered.
2019-09-26 13:13:58 +03:00
Aliaksandr Valialkin
2b8358726f lib/storage: properly match labels against regexp with (?i) flag
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/161
2019-09-26 11:03:26 +03:00
Aliaksandr Valialkin
bd1cf053f6 app/vmselect/promql: add increases_over_time and decreases_over_time functions
`increases_over_time(q[d])` returns the number of `q` increases during the given duration `d`.
`decreases_over_time(q[d])` returns the number of `q` decreases during the given duration `d`.
2019-09-25 20:38:51 +03:00
Aliaksandr Valialkin
4e3871ac1e lib/storage: add missing break in removeDuplicateMetricIDs 2019-09-25 18:23:13 +03:00
Aliaksandr Valialkin
4468f9f966 lib/storage: remove duplicate MetricIDs in tag->metricIDs items before writing them into inverted index 2019-09-25 17:57:36 +03:00
Aliaksandr Valialkin
adc18c3ee6 lib/{mergeset,storage}: do not cache inverted index blocks containing tag->metricIDs items
This should reduce the amounts of used RAM during queries with filters over big number of time series.
2019-09-25 13:48:24 +03:00
Aliaksandr Valialkin
8d398af92f app/vminsert/netstorage: mention the data size that cannot be sent to vmstorage
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/175
2019-09-25 12:53:41 +03:00
Aliaksandr Valialkin
73ac7b8dd6 app/vminsert/netstorage: make sure the conn exists before closing it in storageNode.closeBrokenConn
The conn can be missing or already closed during the call to storageNode.closeBrokenConn.
Prevent `nil pointer dereference` panic by verifying whether the conn is already closed.

Thanks to @CH-anhngo for reporting the issue.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/189
2019-09-25 10:36:50 +03:00
Aliaksandr Valialkin
c64fb91a43 lib/uint64set: optimize Set.AppendTo 2019-09-25 00:34:31 +03:00
Aliaksandr Valialkin
de0e4eee2c lib/storage: create and use lib/uint64set instead of map[uint64]struct{}
This should improve inverted index search performance for filters matching big number of time series,
since `lib/uint64set.Set` is faster than `map[uint64]struct{}` for both `Add` and `Has` calls.
See the corresponding benchmarks in `lib/uint64set`.
2019-09-24 21:18:04 +03:00
Aliaksandr Valialkin
2212d0e421 lib/storage: typo fix: return dstData instead of data from mergeTagToMetricIDsRows 2019-09-24 19:32:58 +03:00
Aliaksandr Valialkin
9307de1b92 lib/storage: limit the number of metricIDs in tag->metricIDs row
This reduces the overhead on index and metaindex in lib/mergeset
2019-09-24 00:50:47 +03:00
Aliaksandr Valialkin
7734fc8012 lib/storage: share tsids across all the partSearch instances
This should reduce memory usage when big number of time series matches the given query.
2019-09-23 22:36:16 +03:00
Aliaksandr Valialkin
67a2bcb98a lib/{storage,mergeset}: verify PrepareBlock callback results
Do not touch the first and the last item passed to PrepareBlock
in order to preserve sort order of mergeset blocks.
2019-09-23 20:46:33 +03:00
Aliaksandr Valialkin
3304dc1e85 lib/mergeset: detect whether we are in test by executable suffix 2019-09-22 23:12:35 +03:00
Aliaksandr Valialkin
d2ed8cb0b2 lib/storage: generate the first tag->metricIDs item in a mergeset block with a single metricID
The first item from each mergeset block goes into index (lib/mergeset.blockHeader),
so it must be short in order to reduce index size.
2019-09-22 19:37:50 +03:00
Aliaksandr Valialkin
0a9cb6368e lib/workingsetcache: remove data race when resetting c.misses 2019-09-22 19:37:09 +03:00
Aliaksandr Valialkin
7d13c31566 lib/{storage,mergeset}: merge tag->metricID rows into tag->metricIDs rows for common tag values
This should improve lookup performance if the same `label=value` pair exists
in big number of time series.
This should also reduce memory usage for mergeset data cache, since `tag->metricIDs` rows
occupy less space than the original `tag->metricID` rows.
2019-09-20 22:06:23 +03:00
Aliaksandr Valialkin
272e2f77c9 lib/encoding: optimize UnmarshalUint* and UnmarshalInt* 2019-09-20 13:08:24 +03:00
Aliaksandr Valialkin
7e0c6d4ca6 lib/storage: optimize selecting all the metricIDs by scanning MetricID->TSID entries instead of tag->MetricID entries
The number of MetricID->TSID entries is smaller than the number of tag->MetricID entries
and MetricID->TSID entries are usually shorter than tag->MetricID entries.
This should improve performance when selecting all the metricIDs.
2019-09-20 11:57:57 +03:00
Aliaksandr Valialkin
b0c738ae8b app/vminsert/opentsdbhttp: remove FATAL prefix from logger.Fatalf errors for the sake of consistency with other logger.Fatalf calls 2019-09-19 22:16:11 +03:00
Aliaksandr Valialkin
bf8505353a lib/mergeset: rename misleading mergeSmallParts to mergeExistingParts 2019-09-19 21:48:36 +03:00
Aliaksandr Valialkin
ebbef20535 lib/mergeset: use sort.IsSorted instead of sort.SliceIsSorted in inmemoryBlock.isSorted in order to reduce memory allocations 2019-09-19 20:13:54 +03:00
Aliaksandr Valialkin
89234f395d lib/storage: use sort.Sort instead of sort.Slice in getSortedMetricIDs 2019-09-19 20:08:13 +03:00
Aliaksandr Valialkin
6e586fa09c lib/storage: skip duplicate call to intersectMetricIDsWithTagFilter on zero successful intersects 2019-09-19 17:51:10 +03:00
Aliaksandr Valialkin
410f993bf6 lib/mergeset: fill partHeader.firstItem on first block flush 2019-09-19 17:48:22 +03:00
Aliaksandr Valialkin
c05885fb5f lib/storage: mark tag filter returning errFallbackToMetricNameMatch as useless
This will save CPU on subsequent calls for this filter
2019-09-18 19:11:44 +03:00
Aliaksandr Valialkin
e041a196a7 deployment/docker/docker-compose.yml: update Prometheus from v2.3.2 to v2.12.0 2019-09-18 18:30:02 +03:00
Aliaksandr Valialkin
db71c940ea lib/storage: properly construct keys for uselessTagFiltersCache and register useless negative tag filters there 2019-09-17 23:18:37 +03:00
Artem Navoiev
ccb6dc6925 [deployment] clean up helm (#185) 2019-09-16 21:58:16 +03:00
Aliaksandr Valialkin
491b1317f4 vendor: update github.com/valyala/gozstd from v1.6.1 to v1.6.2 2019-09-16 21:50:02 +03:00
Aliaksandr Valialkin
5666112de2 deployment: switch docker image tag from v1.27.2-cluster to v1.27.3-cluster 2019-09-14 11:33:18 +03:00
Aliaksandr Valialkin
ba21622b78 vendor: make vendor-update 2019-09-13 22:49:34 +03:00
Aliaksandr Valialkin
020341d13a deployment/docker: remove file system paths from the compiled binary 2019-09-13 22:46:07 +03:00
Aliaksandr Valialkin
550a12415a app/vminsert/netstorage: log network errors when sending data to vmstorage nodes
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/175
2019-09-13 22:26:24 +03:00
Aliaksandr Valialkin
41ef6b060e lib/mergeset: properly check for sorted block headers
Fix a typo for https://github.com/VictoriaMetrics/VictoriaMetrics/issues/181
2019-09-13 21:59:38 +03:00
Aliaksandr Valialkin
ee4585db33 app/vmselect/promql: properly handle subqueries like aggr_func(rollup_func(metric[window:step]))
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/184
2019-09-13 21:42:11 +03:00
hanzai
08cde5e3f4 lib/workingsetcache: adjust switching from mode=split to mode=whole smoothly and load cachefile successfully 2019-09-13 19:13:16 +03:00
Aliaksandr Valialkin
828e5f6d26 app/vmselect/promql: binary operation fixes according to Prometheus behaviour
The following issues were fixed:
- VictoriaMetrics could leave superfluous labels when using `on` or `ignoring` modifiers
- VictoriaMetrics could return `duplicate timeseries` error when using `group_left` or `group_right` with non-empty label list
2019-09-13 17:43:09 +03:00
Artem Navoiev
62b424bc4c [ci] github actions - run pipeline on pull request. Fix running of test in external PR from forks 2019-09-11 14:54:45 +03:00
Aliaksandr Valialkin
ed50b8792b app/vminsert/netstorage: reduce the maximum buffer size for rerouted rows, so it occupies less RAM 2019-09-11 14:50:30 +03:00
Aliaksandr Valialkin
b101064f8b all: report the number of bytes read on io.ReadFull error
This should simplify error investigation similar to https://github.com/VictoriaMetrics/VictoriaMetrics/issues/175
2019-09-11 14:50:24 +03:00
Aliaksandr Valialkin
2f4c950fe9 app/vminsert/netstorage: send per-storageNode bufs to vmstorage nodes in parallel
This should improve the maximum ingestion throughput.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/175
2019-09-11 14:50:19 +03:00
Aliaksandr Valialkin
694cc59ed1 app/vminsert/netstorage: dynamically adjust timeouts for sending packets from vminsert to vmstorage depending on packet size
Bigger packets will have more chances to be sent to vmstorage.
2019-09-11 14:50:14 +03:00
Aliaksandr Valialkin
568ff61dcf lib/mergeset: dynamically calculate the maximum number of items per part, which can be cached in OS page cache 2019-09-09 11:42:45 +03:00
Artem Navoiev
dc6e4151b0 [ci] bump version of go to 1.13 in github actions config 2019-09-08 19:52:05 +03:00
Aliaksandr Valialkin
9b8af27786 vendor: update github.com/klauspost/compress from v1.7.6 to v1.8.2 2019-09-06 00:49:57 +03:00
Aliaksandr Valialkin
b71d828e84 vendor: update golang.org/x/sys 2019-09-06 00:49:57 +03:00
Aliaksandr Valialkin
1f4e0b722d deployment: switch docker image tag from v1.27.1-cluster to v1.27.2-cluster 2019-09-05 12:30:03 +03:00
Aliaksandr Valialkin
2c654258ef lib/fs: add MustStopDirRemover for waiting until pending directories are removed on graceful shutdown
This patch is mainly required for laggy NFS. See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/162
2019-09-05 11:17:17 +03:00
Aliaksandr Valialkin
d0953e9f02 app/vmselect/promql: ignore grouping by destination label in count_values, since such a grouping is performed automatically 2019-09-04 19:59:02 +03:00
Aliaksandr Valialkin
2c2bd897dd lib/storage: remove duplicate tag keys on MetricName.Marshal call
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/172
2019-09-04 18:13:51 +03:00
Aliaksandr Valialkin
5a9b1d85bb deployment/docker: switch Go builder from Go 1.12.9 to Go 1.13.0 2019-09-04 17:17:52 +03:00
Aliaksandr Valialkin
f78ffe565f app/vmselect/promql: do not return artificial points beyond the last point in time series 2019-09-04 16:34:29 +03:00
Aliaksandr Valialkin
a7d5d611fe app/vmselect/prometheus: do not adjust start and end args in /api/v1/query_range if nocache=1 arg is set
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/171
2019-09-04 13:10:17 +03:00
Aliaksandr Valialkin
82bfe818d0 lib/fs: try harder with directory removal on NFS in the event of temporary lock
Do not give up after 11 attempts of directory removal on laggy NFS.

Add `vm_nfs_dir_remove_failed_attempts_total` metric for counting the number of failed attempts
on directory removal.

Log failed attempts on directory removal after long sleep times.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/162
2019-09-04 12:24:41 +03:00
Alfred Krohmer
7cde25bac4 Don't render extraLabels in ServiceMonitor if they are not given
This produced invalid YAML before.
2019-09-03 22:11:24 +03:00
Aliaksandr Valialkin
3182e2a66b deployment: update docker images from v1.27.0-cluster to v1.27.1-cluster 2019-09-03 21:05:50 +03:00
Aliaksandr Valialkin
b08f085082 app/vmselect/promql: reset timeseries name on group_left and group_right as Prometheus does 2019-09-03 20:43:29 +03:00
Aliaksandr Valialkin
458d412bb6 app/vmselect/netstorage: adaptively adjust the maximum inmemory file size for storing temporary blocks
The maximum inmemory file size now depends on `-memory.allowedPercent`.
This should improve performance and reduce the number of filesystem calls
on machines with big amounts of RAM when performing heavy queries
over big number of samples and time series.
2019-09-03 13:32:18 +03:00
Aliaksandr Valialkin
0b0153ba3d lib/storage: invalidate tagFilters -> TSIDS cache when newly added index data becomes visible to search
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/163
2019-08-29 15:08:44 +03:00
Aliaksandr Valialkin
8504a38214 lib/prompb: apply ba06b47c16
The following commands used:

gofmt -r '(uint64(x)&0x7F)<<shift -> uint64(x&0x7F)<<shift' -w ./lib/prompb/
gofmt -r '(int64(x)&0x7F)<<shift -> int64(x&0x7F)<<shift' -w ./lib/prompb/
2019-08-29 13:35:54 +03:00
Aliaksandr Valialkin
fb719bfb23 deployment: update docker images from v1.26.0-cluster to v1.27.0-cluster 2019-08-29 00:09:51 +03:00
Aliaksandr Valialkin
8f81908b1f .github/workflows: added GitHub actions 2019-08-28 23:11:26 +03:00
Aliaksandr Valialkin
604a4312f9 all: port to FreeBSD on GOARCH=amd64 2019-08-28 01:46:09 +03:00
Aliaksandr Valialkin
5893a9f9a3 app/vmstorage: increase default values for search.maxTagKeys, search.maxTagValues and search.maxUniqueTimeseries 2019-08-27 14:28:26 +03:00
Aliaksandr Valialkin
da07a6fb38 lib/storage: go fmt 2019-08-27 14:28:24 +03:00
Aliaksandr Valialkin
a63b69e9e2 lib/storage: report proper maxMetrics limit when more than -search.maxUniqueTimeseries series match the given filters 2019-08-27 14:21:31 +03:00
Aliaksandr Valialkin
82e813bad3 lib/storage: properly handle (?i) in the tag filter regexp
Fixes https://github.com/VictoriaMetrics/VictoriaMetrics/issues/161
2019-08-26 00:44:56 +03:00
Aliaksandr Valialkin
e2eac858b5 lib/storage: calculate the maximum number of rows per small part from -memory.allowedPercent
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/159

This simplifies error detection additionally to the `vm_rows_ignored_total` counters.
2019-08-25 15:29:09 +03:00
Aliaksandr Valialkin
0a8dd9cc9a lib/storage: calculate the maximum number of rows per small part from -memory.allowedPercent
This should improve query speed over recent data on machines with big amounts of RAM
2019-08-25 14:41:32 +03:00
Aliaksandr Valialkin
bc576fb386 lib/storage: properly limit the number of output rows in small and big parts storage
Previously small parts storage didn't take into account the available disk space for big parts.
2019-08-25 14:41:32 +03:00
Aliaksandr Valialkin
947decb3dd lib/storage: remove outdated comment on maxRowsPerSmallPart
The comment became outdated after the commit ed6ac1a5df027f0dfc22448e3b27c26b6f77c67a,
which stops merging of small parts on graceful shutdown instead of waiting
for their completion.
2019-08-25 13:46:10 +03:00
Artem Navoiev
ce7798a6a2 [deployment] add ingresses and service monitors 2019-08-25 01:04:56 +03:00
Aliaksandr Valialkin
38711526d3 app/vminsert/influx: set db label only if Influx line doesn't have db tag 2019-08-24 13:55:01 +03:00
Aliaksandr Valialkin
023675c33e vendor: update github.com/valyala/quicktemplate from v1.1.1 to v1.2.0 2019-08-24 13:39:46 +03:00
Aliaksandr Valialkin
1ee536f9fd app/vminsert: skip empty tags 2019-08-24 13:36:41 +03:00
Aliaksandr Valialkin
a283023d16 app/vminsert/opentsdbhttp: skip invalid rows and continue parsing the remaining rows
Invalid rows are logged and counted in `vm_rows_invalid_total{type="opentsdb-http"}` metric
2019-08-24 13:36:41 +03:00
Aliaksandr Valialkin
38b9615c53 app/vminsert/opentsdb: skip invalid rows and continue parsing the remaining rows
Invalid rows are logged and counted in `vm_rows_invalid_total{type="opentsdb"}` metric
2019-08-24 13:36:41 +03:00
Aliaksandr Valialkin
2a8fc41bab app/vminsert/graphite: skip invalid rows and continue parsing the remaining rows
Invalid rows are logged and counted in `vm_rows_invalid_total{type="graphite"}` metric
2019-08-24 13:36:41 +03:00
Aliaksandr Valialkin
22685ef94d app/vminsert/influx: skip invalid rows and continue parsing the remaining rows
Invalid influx lines are logged and counted in `vm_rows_invalid_total{type="influx"}` metric.
2019-08-24 13:36:41 +03:00
Aliaksandr Valialkin
425a81a6c7 app/vminsert/influx: do not allow escaping newline char, since they don't occur in real life
The previous report with escaped newline chars in influx line protocol was a false alarm.
2019-08-23 18:43:00 +03:00
Aliaksandr Valialkin
8da8dd0876 app/vminsert/opentsdbhttp: allow timestamp as float64 and as string, since it occurs in real life 2019-08-23 18:35:52 +03:00
Aliaksandr Valialkin
0ea21eb9dc app/vminsert/influx: handle \r\n aka crlf influx line endings from windows world
Such lines exist in real life.
2019-08-23 18:28:54 +03:00
Aliaksandr Valialkin
b3502b2b39 app/vminsert/influx: allow escaping newline char
Though newline char isn't mentioned in escape rules at https://docs.influxdata.com/influxdb/v1.7/write_protocols/line_protocol_tutorial/ ,
there are reports that such chars occur in real life
2019-08-23 15:14:58 +03:00
Aliaksandr Valialkin
f1f8fce4f7 app/vminsert/influx: skip comments starting with # in influx line protocol 2019-08-23 14:43:24 +03:00
Aliaksandr Valialkin
697de90893 app/vminsert: do not drop data in reroutedBuf if all the storage nodes are unhealthy 2019-08-23 10:38:19 +03:00
Aliaksandr Valialkin
a5dc54efc3 app/vminsert: properly limit the size of reroutedBuf 2019-08-23 10:29:51 +03:00
Aliaksandr Valialkin
c50975e12d vendor: make vendor-update 2019-08-23 10:03:42 +03:00
Aliaksandr Valialkin
c197641978 all: return 503 http error if service is temporarily unavailable
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/156
2019-08-23 09:49:50 +03:00
Aliaksandr Valialkin
e734076f0f app/vminsert: allow setting the maximum number of labels per time series via -maxLabelsPerTimeseries 2019-08-23 08:47:18 +03:00
Aliaksandr Valialkin
4ed63d033a lib/storage: add benchmarks for regexp filter match / mismatch
These benchmarks allow estimating the performance of regexp filters in promql
2019-08-22 16:37:19 +03:00
Aliaksandr Valialkin
559dd03181 deployment: update docker image tags from v1.25.2-cluster to v1.26.0-cluster 2019-08-22 14:58:58 +03:00
Aliaksandr Valialkin
e9db22a551 app/vmselect/promql: attempt to repair invalid bucket counts passed to histogram_quantile
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/136
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/154
2019-08-22 14:39:24 +03:00
Aliaksandr Valialkin
0697164b4f app/vminsert: add ability to ingest data via HTTP OpenTSDB /api/put requests
This is manual merge of the https://github.com/VictoriaMetrics/VictoriaMetrics/pull/152
Thanks to nustinov@gmail.com for the initial pull request.
2019-08-22 12:46:54 +03:00
Aliaksandr Valialkin
4d555c7c87 app/vminsert/opentsdb: fix BenchmarkRowsUnmarshal by adding missing put prefixes to each line 2019-08-21 19:15:04 +03:00
Aliaksandr Valialkin
90a4b00b10 app/vmselect/promql: fix panic on -search.disableCache
Reset the cache if it is disabled instead of stopping, since it is stopped on graceful shutdown.
2019-08-21 17:12:01 +03:00
Aliaksandr Valialkin
491b1762c8 app/vmselect/promql: explain why empty timeseries aren't removed in transformLabelValue 2019-08-21 11:29:41 +03:00
Aliaksandr Valialkin
db1de4277c app/vmselect/promql: remove NaNs from /api/v1/query_range output like Prometheus does
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/153
2019-08-20 23:01:59 +03:00
Aliaksandr Valialkin
99331606e1 app/vmselect/promql: pre-allocate memory for map for checking for duplicate timeseries
This should reduce memory allocations for big number of timeseries
2019-08-20 23:01:57 +03:00
Aliaksandr Valialkin
1101765adb app/vmselect/promql: add label_value(q, label_name) func, which returns numeric value labels with name label_name in q 2019-08-20 00:28:44 +03:00
Aliaksandr Valialkin
6ec6a8d7c1 lib/storage: try slower path for searching the tag filter with the minimum number of matching time series before giving up with increase -search.maxUniqueTimeseries error 2019-08-19 16:07:05 +03:00
Aliaksandr Valialkin
940349ccb9 app/vmselect/promql: independently track offset hints for tStart and tEnd
This should improve performance if timeseries starts or ends on the selected time range
2019-08-19 13:40:24 +03:00
Aliaksandr Valialkin
6ae4b4190f app/vmselect/promql: optimize search for timestamp boundaries in rollupConfig.Do
This should improve the performance of queries over big number of time series
with big number of output points.
2019-08-19 13:03:38 +03:00
Aliaksandr Valialkin
c59f5c4865 lib/storage: pre-allocate memory for blockHeader slice in unmarshalBlockHeaders
This reduces memory usage and memory fragmentation when working with big number of time series
2019-08-19 12:46:45 +03:00
Aliaksandr Valialkin
45e57be590 deployment: update docker image tags from v1.25.1-cluster to v1.25.2-cluster 2019-08-18 22:56:11 +03:00
Aliaksandr Valialkin
0f45273e20 deployment/docker: switch Go builder from go1.12.8 to go1.12.9 2019-08-18 22:09:21 +03:00
Aliaksandr Valialkin
005aabd305 app/vmselect/promql: add scrape_interval(q[d]) function, which would return scrape interval for q over d 2019-08-18 21:08:15 +03:00
Aliaksandr Valialkin
218cb4623a app/vmselect/promql: handle comparisons with NaN similar to Prometheus
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/150
2019-08-18 00:25:58 +03:00
Aliaksandr Valialkin
dcce92c63c app/vmselect/promql: add lifetime(q[d]) function, which returns the lifetime of q over d in seconds.
This function is useful for determining time series lifetime.
`d` must exceed the expected lifetime of the time series, otherwise
the function would return values close to `d`.
2019-08-16 11:59:51 +03:00
Aliaksandr Valialkin
0cb66a8f95 app/vmselect/promql: fix corner-case calculation for ideriv 2019-08-16 11:59:50 +03:00
Aliaksandr Valialkin
1b5b9ced27 app/vmselect/promql: properly handle corner cases for rollup functions 2019-08-15 23:31:28 +03:00
Aliaksandr Valialkin
f696cc503a lib/workingsetcache: automatically detect when it is better to double cache capacity 2019-08-15 22:58:04 +03:00
Aliaksandr Valialkin
97634d7101 deployment/docker: switch Go builder from go1.12.7 to go1.12.8 2019-08-15 20:43:23 +03:00
Aliaksandr Valialkin
e6541a7676 deployment: update docker images 2019-08-15 14:18:43 +03:00
Aliaksandr Valialkin
e399b948de Makefile: remove duplicate -cluster suffix from tar.gz file generated by make release, since this suffix must be already present in PKG_TAG 2019-08-15 14:07:43 +03:00
Aliaksandr Valialkin
1dd736a75c Makefile: add make release rule for building release tar.gz file with cluster binaries
Fixes https://github.com/VictoriaMetrics/VictoriaMetrics/issues/146
2019-08-15 14:05:56 +03:00
Aliaksandr Valialkin
c15dfc6cea vendor: update github.com/valyala/gozstd from v1.5.1 to v1.6.0 2019-08-15 12:57:59 +03:00
Aliaksandr Valialkin
83ed5d3109 deployment: update docker images 2019-08-14 03:12:49 +03:00
Aliaksandr Valialkin
99eed2ca14 lib/storage: properly cache tagFilters -> TSIDs entries from historical index 2019-08-14 02:32:25 +03:00
Aliaksandr Valialkin
f1d81b9405 lib/storage: compress contents of cache for tagFilters -> TSIDs
This should increase cache capacity
2019-08-14 02:32:22 +03:00
Aliaksandr Valialkin
b8bbe92de1 app/vmselect/promql: store compressed results in the cache
This should increase rollup results cache capacity.
2019-08-14 02:32:16 +03:00
Aliaksandr Valialkin
8c2158af24 all: use workingsetcache instead of fastcache
This should reduce the amount of RAM required for processing time series
with non-zero churn rate.

The previous cache behavior can be restored with `-cache.oldBehavior` command-line flag.
2019-08-13 21:40:28 +03:00
Aliaksandr Valialkin
51263b1a45 lib/fs: add test for IsTemporaryFileName 2019-08-13 21:33:54 +03:00
Aliaksandr Valialkin
867612a4a4 Makefile: consistency renaming: check_all -> check-all 2019-08-13 21:32:08 +03:00
Aliaksandr Valialkin
5a7ab0d90b lib/storage: remove broken BenchmarkIndexDBSearchTSIDs 2019-08-13 20:21:23 +03:00
Aliaksandr Valialkin
39f3f3a517 lib: move common code for creating flock.lock file into fs.CreateFlockFile 2019-08-13 01:46:20 +03:00
Aliaksandr Valialkin
73f866d874 lib/fs: atomically create file with the given contents on WriteFileAtomically
This should prevent from `transaction` and `metadata.json` files corruption
on unclean shutdown such as OOM, `kill -9`, power loss, etc.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/148
2019-08-12 15:02:04 +03:00
Aliaksandr Valialkin
ad5be625f8 deployment: update docker images 2019-08-06 16:10:03 +03:00
Aliaksandr Valialkin
4fb635b0c9 lib/storage: do not change timestamps to constant rate if values are constant or have constant delta
This breaks the original timestamps, which results in issues like
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/120 and
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/141 .
2019-08-06 15:40:17 +03:00
Aliaksandr Valialkin
f56c1298ad app/vmstorage: add vm_concurrent_addrows_* metrics for tracking concurrency for Storage.AddRows calls
Track also the number of dropped rows due to the exceeded timeout
on concurrency limit for Storage.AddRows. This number is tracked in `vm_concurrent_addrows_dropped_rows_total`
2019-08-06 15:08:43 +03:00
Aliaksandr Valialkin
2d869c6d9b vendor: update github.com/VictoriaMetrics/metrics to v1.7.1 2019-08-05 19:21:53 +03:00
Aliaksandr Valialkin
8e05758ff5 app: add vm_concurrent_ metrics for visibility in concurrency limiters for vminsert and vmselect 2019-08-05 18:30:29 +03:00
Aliaksandr Valialkin
1258c9ef10 vendor: make vendor-update 2019-08-05 10:34:38 +03:00
Aliaksandr Valialkin
a3ecf3c1f7 lib/storage: properly reset partSearch.fetchData in partSearch.reset 2019-08-05 09:55:50 +03:00
Artem Navoiev
dd4ea63ed2 [deployment] add statefulset for vmselect (#140) 2019-08-04 23:34:05 +03:00
Aliaksandr Valialkin
a868f8607f deployment: update docker images to v1.24.0-cluster 2019-08-04 23:31:57 +03:00
Aliaksandr Valialkin
53c8f56436 app/vmselect: allow passing match[], start and time to /api/v1/label/<label_name>/values
`/api/v1/label/<label_name>/values?match[]=q` emulates `label_values(q, <label_name>)`
call in Grafana templating.
2019-08-04 23:07:00 +03:00
Aliaksandr Valialkin
880b1d80b1 app/vmselect: optimize /api/v1/series by skipping storage data
Fetch and process only time series metainfo.
2019-08-04 23:00:46 +03:00
Aliaksandr Valialkin
7f5afae1e3 app/vmselect/prometheus: prevent from fetching and scanning all the data on /api/v1/series call by default 2019-08-04 19:42:45 +03:00
Aliaksandr Valialkin
000c154641 app/vmselect/promql: tune automatic window adjustment
Increase the window adjustment for small scrape intervals,
since they usually have higher jitter.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/139
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/134
2019-08-04 19:34:11 +03:00
Aliaksandr Valialkin
1d4ddadbb1 app/vmselect/promql: further increase the allowed jitter for scrape interval
Real-world production data shows higher jitter than 1/8 of scrape interval.
This may result in gaps on the graph. So increase the allowed jitter to 1/4
of scrape interval in order to reduce the probability of gaps on the graphs
over time series with high jitter for scrape_interval.
2019-08-02 20:16:41 +03:00
Aliaksandr Valialkin
8ed84a4713 app/vminsert/influx: round automatically generated timestamp according to the given precision arg 2019-08-02 00:24:39 +03:00
Aliaksandr Valialkin
ade7bc30db app/vmselect/promql: tolerate higher jitter in scrape interval
Allow jitter for up to 1/8 instead of 1/16 for the scrape interval.
This should improve graphs when `step` is smaller than the `scrape_interval`.
2019-08-01 23:25:53 +03:00
Aliaksandr Valialkin
a99e89945e lib/decimal: modernize tests a bit 2019-07-31 21:09:54 +03:00
Aliaksandr Valialkin
6fceedccce deployment: update docker images 2019-07-31 16:38:39 +03:00
Aliaksandr Valialkin
c994fbf500 app/vmselect/promql: add vm_slow_queries_total metric for counting slow queries
The query is slow if its execution time exceeds `-search.logSlowQueryDuration`
2019-07-31 03:36:45 +03:00
Aliaksandr Valialkin
071a122119 app/vmselect/promql: return NaN from histogram_quantile if at least a single bucket is broken 2019-07-31 01:18:11 +03:00
Aliaksandr Valialkin
b9a16b93e7 app/vmselect/promql: allow adjusting window for default rollup function
Default rollup function is `last_over_time`. It must support adjusting
the provided window in order to prevent from gaps on the graph
for window values smaller than scrape interval.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/134
2019-07-31 00:45:58 +03:00
Aliaksandr Valialkin
c901a6472f app/vmselect/promql: return NaN values if invalid bucket counts are passed to histogram_quantile
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/136
2019-07-30 22:05:55 +03:00
Aliaksandr Valialkin
b7c4b0c6d2 lib/storage: fix matching against tag filter with empty name
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/137
2019-07-30 15:15:21 +03:00
Aliaksandr Valialkin
5b8526e925 app/vmselect/netstorage: improve error message when reading data blocks from storage
Mention the block number in the error. This should simplify troubleshooting in this code.
2019-07-28 12:17:33 +03:00
Aliaksandr Valialkin
b7089705b7 app/vminsert: add vm_rows_per_insert summary metric
This metric should help tuning batch sizes on clients writing data to VictoriaMetrics
2019-07-27 13:28:20 +03:00
Aliaksandr Valialkin
1fd4e9fb5c app/vminsert: improve error messages for Influx, OpenTSDB and Graphite parsing
Include in the error message the line which failed to parse.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/127
2019-07-26 22:09:21 +03:00
Aliaksandr Valialkin
34b21a8671 deployment: update cluster image to v1.23.0-cluster 2019-07-26 20:07:47 +03:00
Aliaksandr Valialkin
8253790157 app/vmstorage: consistency renaming for ignored rows metrics
vm_too_big_timestamp_rows_total -> vm_rows_ignored_total{reason="big_timestamp"}
  vm_too_small_timestamp_rows_total -> vm_rows_ignored_total{reason="small_timestamp"}
2019-07-26 20:02:24 +03:00
Aliaksandr Valialkin
c6bec48927 lib/storage: add metrics for calculating skipped rows outside the retention
The metrics are:

    - vm_too_big_timestamp_rows_total
    - vm_too_small_timestamp_rows_total
2019-07-26 14:11:56 +03:00
Aliaksandr Valialkin
aac482517f app/vmselect/promql: return NaN from count() over zero time series
This aligns `count` behavior with Prometheus.
2019-07-25 22:02:34 +03:00
Aliaksandr Valialkin
0e52357f35 app/vmselect/promql: properly calculate incremental aggregations grouped by __name__
Previously the following query may fail on multiple distinct metric names match:

    sum(count_over_time{__name__!=''}) by (__name__)
2019-07-25 21:53:26 +03:00
Aliaksandr Valialkin
f2e8d54fb0 lib/encoding/zstd: go fmt 2019-07-25 01:37:57 +03:00
Aliaksandr Valialkin
97b5dc7122 lib/encoding/zstd: disable CRC checks in pure Go build
This should give slightly better compression and decompression performance.
Additionally this shaves off 4 bytes per each compressed block.
2019-07-24 19:17:32 +03:00
Aliaksandr Valialkin
54f035d4ce all: small updates after PR #114 2019-07-24 17:43:43 +03:00
Aliaksandr Valialkin
7a133567fb lib/encoding: small fixes in tests after the PR #114 2019-07-24 17:43:39 +03:00
Roman Khavronenko
fcf09aaa3c all: add Pure Go build (pull request #114)
Updates #94
2019-07-24 17:43:32 +03:00
Aliaksandr Valialkin
dd7bba94a3 dashboards: use rate instead of irate, because irate doesn't capture spikes
See https://medium.com/@valyala/why-irate-from-prometheus-doesnt-capture-spikes-45f9896d7832 for details
2019-07-20 15:55:48 +03:00
Aliaksandr Valialkin
3fae34eeb4 lib/encoding: improve gauge series detection
- Series with negative values are always gauges
- Counters may only have increasing values with possible counter resets

This should improve compression ratio for gauge series which
were previously mistakenly detected as counters.
2019-07-20 14:05:25 +03:00
Aliaksandr Valialkin
b335a811c3 deployment: switch builder from go1.12.6 to go1.12.7 2019-07-20 12:14:05 +03:00
Jiri Tyr
0aed0e0b5d Adding Grafana dashboards for VM cluster (#105) 2019-07-20 10:25:09 +03:00
Aliaksandr Valialkin
cb8104cf77 app: clarify error messages when -storageNode arg is missing in vminsert and vmselect 2019-07-20 10:21:59 +03:00
Aliaksandr Valialkin
fab1962e02 deployment/k8s/helm: use correct default ports for -storageNode
Previously these ports were swapped. Correct ports are:

- vminsert: -storageNode=*:8400
- vmselect: -storageNode=*:8401
2019-07-20 01:24:32 +03:00
Aliaksandr Valialkin
e3dcfe5851 deployment/docker/docker-compose.yml: use default ports for vminsert and vmselect services
These ports were swapped. Correct default ports are:

- vminsert: -httpListenAddr=:8480, -storageNode=*:8400
- vmselect: -httpListenAddr=:8481, -storageNode=*:8401
2019-07-20 01:20:08 +03:00
Thor Anker Kvisgård Lange
f576b267eb Fixed small bug in vmstorage name template
Signed-off-by: Thor Anker Kvisgård Lange <thanl@mhivestasoffshore.com>
2019-07-17 13:30:23 +03:00
Aliaksandr Valialkin
76b947dcb4 deployment: update Docker images 2019-07-15 23:56:24 +03:00
Aliaksandr Valialkin
7abb96b454 lib/netutil: do not count timeouts as network errors 2019-07-15 23:06:13 +03:00
Aliaksandr Valialkin
2b4254d01f app/vminsert: use netutil.TCPListener for collecting network-related metrics for Graphite and OpenTSDB TCP traffic 2019-07-15 22:58:35 +03:00
Aliaksandr Valialkin
092c9b39a8 app/vmselect/promql: remove empty time series after applying filters like q > 0
This should reduce CPU and RAM usage for queries over high number of time series.
2019-07-12 19:59:49 +03:00
Aliaksandr Valialkin
3bc9d3a14c vendor: update github.com/VictoriaMetrics/metrics to v1.7.0
This version adds support for `process_*` metrics similar
to metrics exposed by https://github.com/prometheus/client_golang .

Fixes https://github.com/VictoriaMetrics/VictoriaMetrics/issues/92
2019-07-12 17:24:58 +03:00
Aliaksandr Valialkin
6875fb411a app/vmselect/promql: parallelize incremental aggregation to multiple CPU cores
This may reduce response times for aggregation over big number of time series
with small step between output data points.
2019-07-12 15:53:12 +03:00
Aliaksandr Valialkin
be0ce54010 deployment: update docker images 2019-07-12 02:35:09 +03:00
Aliaksandr Valialkin
73a47d2a53 lib/storage: remove unused function isTooBigTimeRangeForDateMetricIDs 2019-07-12 02:28:40 +03:00
Aliaksandr Valialkin
97f9397687 lib/storage: do not reduce maxMetrics on time ranges exceeding maxDaysForDateMetricIDs
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/95
2019-07-12 02:21:52 +03:00
Aliaksandr Valialkin
1de6ef5f51 deployment: update Docker images 2019-07-11 19:10:35 +03:00
Aliaksandr Valialkin
4a8e6f47fe app/vmselect/prometheus: set start arg in /api/v1/series to the minimum allowed time by default as Prometheus does
Fixes https://github.com/VictoriaMetrics/VictoriaMetrics/issues/91
2019-07-11 17:11:37 +03:00
Aliaksandr Valialkin
3313cdf816 app/vmselect/prometheus: convert negative times to 0, since they aren't supported by the storage 2019-07-11 17:11:35 +03:00
Aliaksandr Valialkin
4ca66344ee lib/storage: do not pollute inverted index with data for samples outside the retention period 2019-07-11 17:11:33 +03:00
Aliaksandr Valialkin
0522efb2d6 lib/storage: add missing tagFilter.Marshal func 2019-07-11 15:01:01 +03:00
Aliaksandr Valialkin
12b1d67b41 lib/storage: use fast path for orSuffix when searching for metricIDs against plain tag value 2019-07-11 14:48:51 +03:00
Aliaksandr Valialkin
bf2e1b0ac1 lib/storage: remember and skip individual tag filters matching too many metrics
This saves CPU time by skipping useless matching for individual tag filters.
2019-07-11 14:48:47 +03:00
Aliaksandr Valialkin
cbab86fd9d app/vmselect/promql: reduce RAM usage for aggregates over big number of time series
Calculate incremental aggregates for `aggr(metric_selector)` function instead of
keeping all the time series matching the given `metric_selector` in memory.
2019-07-10 13:03:36 +03:00
Aliaksandr Valialkin
ba8195c58e all: consistency renaming: bytesSize -> sizeBytes 2019-07-10 00:47:42 +03:00
Aliaksandr Valialkin
df6f17b82c app/vmselect/promql: mention -search.logSlowQueryDuration flag value in the slow query log message 2019-07-10 00:43:01 +03:00
Aliaksandr Valialkin
73ae889244 app/vmselect/promql: extract removeGroupTags function for removing unneeded tags from MetricName according to the given modifierExpr 2019-07-09 23:20:58 +03:00
Aliaksandr Valialkin
603b34edbd app/vmselect/promql: properly preserve metric name after applying functions in any case from transformFuncsKeepMetricGroup 2019-07-09 23:10:49 +03:00
Aliaksandr Valialkin
d6ec95693d app/vmselect/prometheus: typo fix 2019-07-07 23:34:04 +03:00
Aliaksandr Valialkin
61f6f63964 README.md: mention that vminsert spreads data among vmstorage nodes using consistent hashing 2019-07-07 22:00:45 +03:00
Aliaksandr Valialkin
36636c1f6f app/vmselect/prometheus: handle minTime and maxTime values that may be set by Promxy or Prometheus client
Fixes https://github.com/VictoriaMetrics/VictoriaMetrics/issues/88
2019-07-07 21:53:52 +03:00
Aliaksandr Valialkin
50c5894dc0 deployment: update docker images 2019-07-04 19:54:27 +03:00
Aliaksandr Valialkin
bba07d05fe app/vmselect/promql: remove empty timeseries left after topk call 2019-07-04 19:43:07 +03:00
Aliaksandr Valialkin
41f512af1c all: add vm_data_size_bytes metrics for easy monitoring of on-disk data size and on-disk inverted index size 2019-07-04 19:43:04 +03:00
Aliaksandr Valialkin
512a627855 app/vmselect/prometheus: update adjustLastPoints function
- Do not overwrite last points by the previous NaNs, since this may result in empty time series.
- Overwrite the last 2 points instead of 3. This should be enough in most cases.
2019-07-04 09:30:56 +03:00
Aliaksandr Valialkin
858746fa6c app/vmselect/promql: gracefully handle duplicate timestamps in irate and rollup_rate funcs
Previously such timestamps result in `+Inf` results. Now the previous timestamp is used
for the calculations.
2019-07-03 12:41:30 +03:00
Aliaksandr Valialkin
81da1c7b47 README.md: stylistic updates 2019-07-02 15:57:45 +03:00
Aliaksandr Valialkin
a3abed80ff app/vmselect: do not return empty time series in /api/v1/query result 2019-07-01 17:16:26 +03:00
Aliaksandr Valialkin
6682a35731 lib/memory: attempt #3 to determine memory limit for LXC container
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/84
2019-07-01 14:01:57 +03:00
Aliaksandr Valialkin
c3c60bee45 app/vmselect: add -search.denyPartialResponse flag for disabling partial responses if some of vmstorage nodes are unavailable
Also accept `deny_partial_response` query arg in Prometheus API handlers. If it is set to true,
then return error if some of vmstorage nodes are unavailable.
2019-06-30 01:27:07 +03:00
Aliaksandr Valialkin
60cff62586 deployment: update docker images 2019-06-29 21:25:20 +03:00
Aliaksandr Valialkin
b6ea1a7d5e lib/mergeset: make fmt 2019-06-29 14:25:46 +03:00
Aliaksandr Valialkin
ffc1bb00f6 lib/storage: skip non-matching metricIDs in sortedFilter
This should improve performance for big sortedFilter lists.
2019-06-29 13:49:40 +03:00
Aliaksandr Valialkin
2257dcd278 lib/mergeset: speed up binarySearchKey by skipping the first item during binary search 2019-06-29 13:49:32 +03:00
Aliaksandr Valialkin
72a3050c41 app/vmselect/promql: consistency renaming: candlestick -> rollup_candlestick 2019-06-29 03:13:25 +03:00
Aliaksandr Valialkin
6ea12a079e lib/memory: remove TestReadLXCMemoryLimit, since it doesn't work in Travis 2019-06-28 18:23:06 +03:00
Aliaksandr Valialkin
d0732d3137 lib/memory: attempt #2 to determine memory limit inside LXC container
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/84
2019-06-28 18:08:57 +03:00
Aliaksandr Valialkin
628571a837 README.md: update cluster scheme 2019-06-28 17:54:13 +03:00
Aliaksandr Valialkin
ad436757c3 lib/memory: an attempt to read proper memory limit inside LXC container
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/84
2019-06-28 15:35:02 +03:00
Aliaksandr Valialkin
c6598a8507 vendor: update github.com/VictoriaMetrics/metrics to v1.6.2
This fixes Summary printing for *_count and *_sum values with metric names containing labels.
2019-06-28 14:18:24 +03:00
Aliaksandr Valialkin
4f8cbc0782 vendor: update github.com/VictoriaMetrics/metrics to v1.6.1 2019-06-28 14:06:32 +03:00
Aliaksandr Valialkin
391bc8bf38 app/vmselect: fix 32bit arm build
Fixes https://github.com/VictoriaMetrics/VictoriaMetrics/issues/83
2019-06-27 19:37:17 +03:00
Aliaksandr Valialkin
2d497c3b8e README.md: add cluster scheme image 2019-06-27 19:23:36 +03:00
Aliaksandr Valialkin
96342f1422 app/vmselect: add candlestick(m[d]) func for returning open, close, low and high rollups on the given time range d
This function is frequently used in financial apps. See https://en.wikipedia.org/wiki/Candlestick_chart
2019-06-27 18:46:54 +03:00
Aliaksandr Valialkin
416d27ef11 lib/storage: optimize time series search by regexp filter
This should improve search speed on label filters like `{foo=~"bar.+baz"}`
2019-06-27 16:18:00 +03:00
Aliaksandr Valialkin
5850a9ea78 README.md: improve wording on gossip protocol 2019-06-26 23:50:34 +03:00
Aliaksandr Valialkin
05b7cb1d42 README.md: return back the link to unsuccessful attempt to implement Gossip protocol in Thanos
This link provides good information on gossip fragility
2019-06-26 23:48:48 +03:00
Jiri Tyr
e7a0bf1a71 Change the default influxMeasurementFieldSeparator 2019-06-26 13:22:54 +03:00
Aliaksandr Valialkin
d5cb9fddd8 app/vminsert: fix infinite loop when reading two lines without newline in the end
Fixes https://github.com/VictoriaMetrics/VictoriaMetrics/issues/82
2019-06-26 02:52:56 +03:00
Aliaksandr Valialkin
916d9ef5b3 deployment: update docker images 2019-06-25 21:49:23 +03:00
Aliaksandr Valialkin
4f54bcf90b app/vmselect/promql: suppress error when template func is used inside modifier list. Just leave it as is
Fixes https://github.com/VictoriaMetrics/VictoriaMetrics/issues/78
2019-06-25 20:43:57 +03:00
Aliaksandr Valialkin
72873f67aa README.md: improve docs for Helm chapter 2019-06-25 20:13:47 +03:00
Aliaksandr Valialkin
ee23a143b9 lib/storage: make sure non-nil args are passed to openIndexDB 2019-06-25 20:10:08 +03:00
Aliaksandr Valialkin
8b0a63722f lib/storage: reduce too big maxMetrics in getTagFilterWithMinMetricIDsCountAdaptive
This should improve performance on inverted index search for big amount of unique time series
when big -search.maxUniqueTimeseries is set.
2019-06-25 19:57:31 +03:00
Aliaksandr Valialkin
0263cb0adc lib/storage: free up memory from caches owned by indexDB when it is deleted 2019-06-25 14:41:16 +03:00
Aliaksandr Valialkin
362e187011 lib/storage: use unversioned keys for tag cache in extDB
Data in ExtDB cannot be changed, so it is OK to use unversioned keys for tag cache.
This should improve performance for index lookups over big amount of time series.
2019-06-25 13:15:42 +03:00
Aliaksandr Valialkin
51e2f3b48f lib/storage: skip searching in extDB if it doesn't contain items for the given time range
This should improve inverted index search performance for big amount
of unique time series when the search is performed only on recent data.
2019-06-25 12:57:56 +03:00
Aliaksandr Valialkin
dbc1e87bac deployment: update docker images 2019-06-24 23:11:03 +03:00
Aliaksandr Valialkin
d0bf4393a9 app/vmselect/promql: increase default value for -search.maxPointsPerTimeSeries from 10k to 30k
This may be required for subqueries with small steps. See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/77
2019-06-24 22:53:25 +03:00
Aliaksandr Valialkin
334cf253c7 app/vmselect/promql: adjust value returned by linearRegression to the end of time range like Prometheus does
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/71
2019-06-24 22:46:03 +03:00
Aliaksandr Valialkin
14cd628948 app/vmselect/promql: add sum2 and sum2_over_time, geomean and geomean_over_time funcs.
These functions may be useful for statistic calculations.
2019-06-24 16:45:00 +03:00
Aliaksandr Valialkin
fb9358635d lib/storage: mention source parts on merge error
This should improve determining broken source part.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/76
2019-06-24 14:09:46 +03:00
Aliaksandr Valialkin
0eac538fc8 app/vmselect/promql: adjust the provided window only for range functions with dt in denominator
This should fix range function calculations such as `changes(m[d])` where `d` is smaller
than the scrape interval.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/72
2019-06-23 19:27:25 +03:00
Aliaksandr Valialkin
ec57e59154 app/vmselect/promql: use deriv_fast instead of deriv in ttf, since deriv calculations have been changed recently 2019-06-23 15:54:12 +03:00
Aliaksandr Valialkin
516062b162 app/vmselect/promql: adjust ttf calculation, so deriv(freev) for freev=m[d] could be properly calculated 2019-06-23 14:31:36 +03:00
Aliaksandr Valialkin
5ea5ec4f44 vendor: update github.com/valyala/gozstd to v1.5.1 2019-06-22 00:14:11 +03:00
Aliaksandr Valialkin
ef6ca22c1d deployment: update docker images 2019-06-21 23:35:48 +03:00
Aliaksandr Valialkin
a4e040f5ef app/vmselect/promql: typo fixes in comments 2019-06-21 23:22:54 +03:00
Aliaksandr Valialkin
c05d443791 app/vmselect/promql: add deriv_fast function for calculating fast derivative
`deriv_fast` calculates derivative based on the first and the last point on the interval
instead of calculating linear regression based on all the data points on the interval.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/73
2019-06-21 23:05:48 +03:00
Aliaksandr Valialkin
98eafdbd58 app/vmselect/promql: use linear regression in deriv func like Prometheus does
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/73
2019-06-21 22:54:34 +03:00
Aliaksandr Valialkin
f334908c22 app/vmselect/promql: adjust data model to the model used in Prometheus
Do not take into account data points on the range `[timestamp .. timestamp+step)`
when calculating value on the given `timestamp`.
Use only data points from the past when performing these calculations like Prometheus does.

This should reduce discrepancies between results returned by VictoriaMetrics
and results returned by Prometheus.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/72
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/71
2019-06-21 21:55:25 +03:00
Aliaksandr Valialkin
0fc4cb67dc deployment: update docker images 2019-06-21 13:39:45 +03:00
Aliaksandr Valialkin
837e349b7d app/vmselect/promql: do not strip __name__ from time series after binary comparison operation
Example:

  foo > 10

Would leave `foo` name for all the matching time series on the left.
2019-06-21 13:08:02 +03:00
Aliaksandr Valialkin
9164c223ec all: initial stubs for Windows support; see https://github.com/VictoriaMetrics/VictoriaMetrics/issues/70 2019-06-20 20:07:41 +03:00
Aliaksandr Valialkin
786beb8fc8 Makefile: enable golangci-lint in make check_all 2019-06-20 15:00:58 +03:00
Aliaksandr Valialkin
9cac11db64 lib/storage: typo fixes found by golangci-lint; updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/69 2019-06-20 14:38:45 +03:00
Aliaksandr Valialkin
7778030f9f lib/netutil: remove unused TCPListener.name; updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/69 2019-06-20 14:36:19 +03:00
Aliaksandr Valialkin
e84b7641ef app/vmselect/promql: remove unused func keepLastValue; updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/69 2019-06-20 14:35:19 +03:00
Aliaksandr Valialkin
db042bf6d6 app/vmselect/promql: typo fix; updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/69 2019-06-20 14:33:52 +03:00
Aliaksandr Valialkin
dec2bdf89f Makefile: add make golangci-lint rule for running golangci-lint run; updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/69 2019-06-20 14:32:34 +03:00
Aliaksandr Valialkin
3838d224d5 app/vminsert/opentsdb: remove unused const maxReadPacketSize; update https://github.com/VictoriaMetrics/VictoriaMetrics/issues/69 2019-06-20 14:30:02 +03:00
Aliaksandr Valialkin
a3a53647ba app/vmselect/prometheus: return better error messages on missing args to /api/v1/* 2019-06-20 14:07:44 +03:00
Aliaksandr Valialkin
a0c22a6830 app/vmstorage: add vm_cache_entries{type="storage/hour_metric_ids"} metric for tracking active time series count 2019-06-19 18:37:38 +03:00
Aliaksandr Valialkin
08e255a206 README.md: add link to source codes for cluster branch 2019-06-19 17:56:56 +03:00
Aliaksandr Valialkin
24ae3ef532 lib/prompb: remove superfluous bytes copying in ReadSnappy 2019-06-18 21:02:02 +03:00
Aliaksandr Valialkin
d4ed6189d4 app/vminsert/graphite: allow skipping timestamps in Graphite plaintext protocol
In this case VictoriaMetrics uses the ingestion time as a timestamp.
2019-06-18 19:05:46 +03:00
Aliaksandr Valialkin
7b93da5b57 vendor: update golang.org/x/sys 2019-06-18 16:20:09 +03:00
Aliaksandr Valialkin
2ebcd0c98b deployment: update docker images 2019-06-18 13:36:42 +03:00
Aliaksandr Valialkin
e40224d5de lib/flagutil: add NewArray helper func 2019-06-18 10:44:09 +03:00
Aliaksandr Valialkin
02417071cd README.md: use link to Wikipedia about broken gossip protocol instead of a link to document about Gossip protocol removal from Thanos
Thanos removed non-working gossip protocol a few months ago - https://github.com/improbable-eng/thanos/issues/734 ,
so the link to the design document https://github.com/improbable-eng/thanos/blob/master/docs/proposals/approved/201809_gossip-removal.md
became unavailable. So use a link to Wikipedia article instead.

Closes https://github.com/VictoriaMetrics/VictoriaMetrics/pull/68
2019-06-17 19:10:29 +03:00
Aliaksandr Valialkin
3b16d49514 app/vminsert/influx: add -influxSkipSingleField flag for using {measurement} instead of {measurement}{separator}{field_name} for Influx lines with a single field
Fixes https://github.com/VictoriaMetrics/VictoriaMetrics/issues/66
2019-06-17 19:05:46 +03:00
Aliaksandr Valialkin
5f0b3589b2 app/vminsert/influx: add -influxMeasurementFieldSeparator flag for the ability to change separator for {measurement}{separator}{field_name} metric name 2019-06-14 09:57:13 +03:00
Aliaksandr Valialkin
14edd122a6 deployment/docker: switch builder from go1.12.5 to go1.12.6 2019-06-14 09:31:56 +03:00
Aliaksandr Valialkin
f9e1d32168 lib/storage: persist metric ids for the current and the previous hour on graceful shutdown
This should improve performance after restart when the db contains a lot of time series
with high time series churn (i.e. metrics from Kubernetes with many pods and frequent deployments)
2019-06-14 07:55:09 +03:00
Aliaksandr Valialkin
ba3cccd471 deployment: update docker images 2019-06-12 23:31:06 +03:00
Aliaksandr Valialkin
947bc16f8c app/vmselect/promql: use dynamic limit on memory for concurrent queries 2019-06-12 23:18:23 +03:00
Aliaksandr Valialkin
fe1b33ef1a README.md: mention that accountID is known as tenant 2019-06-12 21:32:10 +03:00
Aliaksandr Valialkin
8567e3463d app/vmselect/promql: merge non-overlapping duplicate time series in group_left and group_right joins 2019-06-12 20:33:01 +03:00
Aliaksandr Valialkin
345ecc37b6 deployment: update docker images 2019-06-12 18:36:17 +03:00
Aliaksandr Valialkin
88005237f4 app/vmselect/promql: swap binary operation with modifier in the error message for improved readability 2019-06-12 17:14:33 +03:00
Aliaksandr Valialkin
a71381ad2a app/vmselect/promql: list a sample of duplicate time series in the error message for group_left or group_right
This should improve troubleshooting for complex queries involving `group_left` and `group_right` modifiers.
2019-06-12 16:57:34 +03:00
Aliaksandr Valialkin
b0b93e3d50 lib/fs: sync parent dir in MustRemoveAll only if it exists
The parent directory may be non-existing when the deleted directory
didn't exist before the MustRemoveAll call
2019-06-12 02:16:15 +03:00
Aliaksandr Valialkin
18d6f293f7 lib/fs: consolidate *RemoveAll* funcs into a single MustRemoveAll func
The func syncs parent dir in order to persist directory removal
in the event of power loss
2019-06-12 01:55:18 +03:00
Aliaksandr Valialkin
28d9904efc lib/fs: panic with fatal error when directories cannot be removed
Unremoved directories may lead to inconsistent data directory,
so VictoriaMetrics will fail to start next time.

So panic on the first error when trying to remove directory in order
to simplify recover process.
2019-06-12 01:20:10 +03:00
Aliaksandr Valialkin
d897bc3f08 lib/fs: attempt #2 to work around NFS issue with directory removal
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/61
2019-06-12 01:07:29 +03:00
Aliaksandr Valialkin
f165500225 vendor: update github.com/VictoriaMetrics/fastcache to v1.5.1 2019-06-11 23:57:15 +03:00
Aliaksandr Valialkin
d1ca2e5a2d deployment: update docker images 2019-06-11 23:22:51 +03:00
Aliaksandr Valialkin
51e2e255a6 lib/fs: consistency renaming SyncPath -> MustSyncPath, since it doesnt return error 2019-06-11 23:13:45 +03:00
Aliaksandr Valialkin
3fa4c28f6b lib/fs: make sure the created directory remains visible in the fs in the event of power loss
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/63
2019-06-11 23:08:17 +03:00
Aliaksandr Valialkin
0b7f751f60 lib/fs: use filepath.Dir instead of filepath.Split, since the filename is unused 2019-06-11 22:54:23 +03:00
Aliaksandr Valialkin
cb9e746484 deployment: update docker images 2019-06-11 22:02:08 +03:00
Aliaksandr Valialkin
b491045a4b lib/{storage,mergeset}: sync filenames inside part when finalizing the part
Fixes https://github.com/VictoriaMetrics/VictoriaMetrics/issues/63
2019-06-11 21:51:19 +03:00
Aliaksandr Valialkin
3437c30180 all: try hard removing directory with contents
Fixes https://github.com/VictoriaMetrics/VictoriaMetrics/issues/61
2019-06-11 01:58:08 +03:00
Aliaksandr Valialkin
f2a8599908 deployment: update docker images 2019-06-11 01:09:52 +03:00
Aliaksandr Valialkin
eea7da8e0c app/vmselect/promql: prevent from count_values explosion of timeseries, which could result in OOM 2019-06-11 01:03:18 +03:00
Aliaksandr Valialkin
e87a602209 app/vmselect/promql: skip superflouos timestamps copying in count_values 2019-06-11 00:44:09 +03:00
Aliaksandr Valialkin
ec84febc1c app/vmselect/promql: remove superflouos timeseries copy in histogram_quantile func 2019-06-11 00:39:35 +03:00
Aliaksandr Valialkin
1fab34fb5c app/vmselect/promql: remove superflouos timeseries copy in union func 2019-06-11 00:35:09 +03:00
Aliaksandr Valialkin
a6f368499d app/vmselect/promql: skip NaN values in count_values func 2019-06-10 22:42:41 +03:00
Aliaksandr Valialkin
2d7165033a deployment: update docker images 2019-06-10 20:38:18 +03:00
Aliaksandr Valialkin
945894e049 app/vmselect: properly handle empty label (aka __name__) in LabelEntries handler 2019-06-10 19:55:02 +03:00
Aliaksandr Valialkin
75a0acf72d app/vmselect: add /api/v1/labels/count handler for quick detection of labels with the maximum number of distinct values 2019-06-10 19:54:55 +03:00
Aliaksandr Valialkin
547bcdce63 app/vmstorage: enable compression of responses to vmselect by default
This should save vmstorage => vmselect network bandwidth in common case
when recently added data is queried.
2019-06-10 14:54:59 +03:00
Aliaksandr Valialkin
0ccedbdfd2 lib/storage: mention the accountID and projectID in error message when filtering out other (accountID, projectID) entries 2019-06-10 14:43:53 +03:00
Aliaksandr Valialkin
d54f5fec0b lib/storage: skip adaptive searching for tag filter matching the minimum number of metrics if the identical previous search didn't found such filter
This should improve speed for searching metrics among high number of time series
with high churn rate like in big Kubernetes clusters with frequent deployments.
2019-06-10 14:07:47 +03:00
Aliaksandr Valialkin
27e50e86f4 lib/storage: factor out getTagFilterWithMinMetricIDsCountAdaptive from updateMetricIDsForTagFilters 2019-06-10 13:26:00 +03:00
Aliaksandr Valialkin
b69d3dbd0c lib/storage: filter out metricIDs from another (AccountID, ProjectID) in getMetricIDsForRecentHours 2019-06-10 13:05:16 +03:00
Aliaksandr Valialkin
3059ae7be0 lib/storage: give clearer names to more functions 2019-06-10 12:59:33 +03:00
Aliaksandr Valialkin
d3a024d2d6 lib/storage: give more clear names to functions 2019-06-10 12:50:22 +03:00
Aliaksandr Valialkin
00e0760608 lib/storage: test GetSeriesCount 2019-06-10 12:40:33 +03:00
Aliaksandr Valialkin
e4cba5a7ed lib/storage: make getSeriesCount func indexSearch method 2019-06-10 12:29:24 +03:00
Aliaksandr Valialkin
4c3913290a app/vmstorage: add missing _total suffixes to newly added metrics 2019-06-09 22:11:41 +03:00
Aliaksandr Valialkin
d882afa905 lib/storage: optimize time series lookup for recent hours when the db contains many millions of time series with high churn rate (aka frequent deployments in Kubernetes) 2019-06-09 19:14:04 +03:00
Aliaksandr Valialkin
5fcdb4a59a app/vminsert: improve handling of unhealthy vmstorage nodes
* Spread load evenly among remaining healthy nodes instead of hammering
  the next node after the unhealthy node.
* Make sure that the packet is flushed to storage node before returning success.
  Previously packets could stay in local buffers and thus lost on connection errors.
* Keep rows in the limited memory when all the storage nodes are unhealthy.
2019-06-09 00:42:36 +03:00
Aliaksandr Valialkin
0f64673327 app/vminsert/concurrencylimiter: typo fix in the error message 2019-06-08 22:43:56 +03:00
Aliaksandr Valialkin
89a113cb5d app/vminsert: really fix #60
ReadLinesBlock may accept dstBuf with non-zero length. In this case the last line without trailing newline isn't read.
Fix this by comparing len(dstBuf) to 0 instead of its original length.
2019-06-07 23:40:10 +03:00
Aliaksandr Valialkin
e1c45b314a app/vminsert: properly read trailing line without newline in the end
This fixes https://github.com/VictoriaMetrics/VictoriaMetrics/issues/60
2019-06-07 23:18:34 +03:00
Aliaksandr Valialkin
8cf0a0e59c app/vminsert: split vm_rows_inserted_total into per-(accountID, projectID) metrics
Fixes https://github.com/VictoriaMetrics/VictoriaMetrics/issues/59
2019-06-07 22:11:20 +03:00
Aliaksandr Valialkin
8b2a6c6182 deployment: update docker images 2019-06-07 11:58:33 +03:00
Aliaksandr Valialkin
30c7652bad deployment/docker: move cluster docker images from valyala/vm* to victoriametrics/vm* docker hub path 2019-06-07 11:55:37 +03:00
Aliaksandr Valialkin
41d087662c deployment: update docker image 2019-06-07 11:40:54 +03:00
Aliaksandr Valialkin
913f888d0c app/vmselect/promql: properly handle {__name__ op "string"} queries
This has been broken in 7294ef333ad26f4f6578b783e97649e58b1f8945 .
2019-06-07 02:02:09 +03:00
Seua Polyakov
5e51ce386e Add SIGINT as stopsignal to docker file (#54)
Add sigint as stopsignal to docker file. You can find more here: https://docs.docker.com/engine/reference/builder/#usage
With this change, the main process inside the container will receive SIGINT, and after a grace period, SIGKILL.

(cherry picked from commit f4e63cd070)
2019-06-06 22:38:48 +03:00
Aliaksandr Valialkin
11979e4d85 app/vmselect/prometheus: report about incorrect time or duration instead of silently using the default value
This should prevent from incorrect usage of the querying API.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/52
2019-06-06 22:17:15 +03:00
Aliaksandr Valialkin
5f2aa4539a app/vminsert: add multi-tenancy support for OpenTSDB and Graphite ingestion via custom tags
* VictoriaMetrics_AccountID tag may be used for setting AccountID
* VictoriaMetrics_ProjectID tag may be used for setting ProjectID
2019-06-06 18:07:30 +03:00
Aliaksandr Valialkin
c98582695f deployment: update docker images 2019-06-06 17:39:31 +03:00
Aliaksandr Valialkin
8f4790625d app/vmselect/promql: return the correct time series from quantile
Previously arbitrary time series could be returned from `quantile`
depending on sort order for the last data point in the selected range.

Fix this by returning the calculated time series.

Fixes https://github.com/VictoriaMetrics/VictoriaMetrics/issues/55
2019-06-06 17:33:53 +03:00
Aliaksandr Valialkin
2ff0d595b0 app/vmselect/promql: add -search.disableCache flag for disabling response caching
This may be useful for data back-filling, when the response caching
could interfere badly with newly added data points with timestamps
in the past.
2019-06-04 17:30:41 +03:00
Aliaksandr Valialkin
595a421295 deployment: update docker images 2019-06-03 23:00:10 +03:00
Aliaksandr Valialkin
ba58af9d8c app/vminsert/influx: take into account all the tags for consistent hash calculations 2019-06-03 22:54:21 +03:00
Aliaksandr Valialkin
db21d46417 app/vminsert: emulate influx/query request, which is required for TSBS benchmark 2019-06-03 18:39:46 +03:00
Aliaksandr Valialkin
8ad0fb5689 deployment: update docker images 2019-06-03 18:21:18 +03:00
Aliaksandr Valialkin
31d6566aff app/vminsert: accept data on /insert/<accountID>/prometheus/api/v1/write 2019-06-03 18:18:09 +03:00
Aliaksandr Valialkin
c3d73e347c deployment/k8s/helm: update NOTES.txt 2019-06-03 17:53:36 +03:00
Aliaksandr Valialkin
cf75d1f0fc README.md: mention that unused snapshots must be deleted 2019-06-03 17:26:35 +03:00
Aliaksandr Valialkin
a06b7f7f84 app/vmselect/netstorage: remove spammy error message when certain vmstorage nodes are unavailable during query execution
The amount of partial responses may be tracked by `vm_partial_search_results_total` metric.
2019-06-03 17:09:50 +03:00
Aliaksandr Valialkin
1d87abc8eb lib/procutil: typo fix in comment to WaitForSigterm 2019-06-03 16:54:37 +03:00
Aliaksandr Valialkin
a2986cde70 lib/storage: tune updating a map with today`s metric ids
- Increase update iterval from 1s to 10s. This should reduce CPU usage
  for large amounts of metric ids with constant churn.
- Reduce pendingTodayMetricIDsLock lock duration during the update.
2019-06-02 22:00:13 +03:00
Aliaksandr Valialkin
e27fd5148a lib/storage: speed up checking metricID existence in the list for the current date 2019-06-02 18:34:20 +03:00
Aliaksandr Valialkin
d7bafde77e vendor: update deps with make vendor-update 2019-06-01 23:40:56 +03:00
Aliaksandr Valialkin
53242105fb app/vmselect/promql: allow escaping identifiers with \ and \xXX
Fixes https://github.com/VictoriaMetrics/VictoriaMetrics/issues/42
2019-05-31 17:35:54 +03:00
Aliaksandr Valialkin
25269682c2 deployment: update docker images 2019-05-29 17:44:14 +03:00
Aliaksandr Valialkin
950310d1c3 Add sections about replication and backups 2019-05-29 13:14:01 +03:00
Aliaksandr Valialkin
ee776ca8fc app/vminsert: add -maxConcurrentInserts command-line flag for limiting the number of concurrent inserts 2019-05-29 12:40:22 +03:00
Aliaksandr Valialkin
a1289d7343 Makefile: run go vet with -mod=vendor in order to disable downloading vendored deps 2019-05-29 01:38:24 +03:00
Aliaksandr Valialkin
a4ec139a4a app/vminsert: reduce memory usage for Influx, Graphite and OpenTSDB protocols
Do not buffer per-connection data and just store it as it arrives
2019-05-28 18:47:52 +03:00
Aliaksandr Valialkin
a6d02ff275 lib/timerpool: use timer pool in concurrency limiters
This should reduce the number of memory allocations in highly loaded system
2019-05-28 17:30:10 +03:00
Aliaksandr Valialkin
6e90aaeb8c Makefile: add -mod=vendor to go test, so tests use external deps from vendor folder 2019-05-27 00:35:59 +03:00
Aliaksandr Valialkin
3b52adaf3f Makefile: pass GO111MODULE=on to all the go invocations 2019-05-26 23:23:21 +03:00
Aliaksandr Valialkin
c944de68cd vendor: update dependencies with make vendor-update 2019-05-26 23:18:42 +03:00
Aliaksandr Valialkin
b7a91d6ba7 app/vmselect: update comment according to the updated code 2019-05-26 22:39:09 +03:00
Aliaksandr Valialkin
15d1e15ae6 app/vminsert/influx: try converting string values to numeric values, since Influx agents may send numeric values as strings
Fixes https://github.com/VictoriaMetrics/VictoriaMetrics/issues/34
2019-05-26 22:12:55 +03:00
Aliaksandr Valialkin
a2c71f18a3 app/vmselect/promql: misspeling fix 2019-05-25 21:53:48 +03:00
Aliaksandr Valialkin
bdf696ef18 all: fix misspellings 2019-05-25 21:51:24 +03:00
Aliaksandr Valialkin
121a920a18 Makefile: add -s flag to go fmt in make fmt command 2019-05-25 21:44:36 +03:00
Aliaksandr Valialkin
a10d27eccd README.md: mention that new vmstorage node must have the same -retentionPeriod as the previous nodes 2019-05-25 17:18:44 +03:00
Aliaksandr Valialkin
c254adba7c README.md: mention that VictoriaMetrics is scalable 2019-05-25 17:09:15 +03:00
Aliaksandr Valialkin
affeb677cc README.md: mention that the majority of users should use single-node version instead of cluster version 2019-05-25 14:09:17 +03:00
Aliaksandr Valialkin
2ff996e276 app/vmselect: log slow queries if their execution time exceeds -search.logSlowQueryDuration 2019-05-24 16:14:46 +03:00
Aliaksandr Valialkin
628708ad76 app/vmselect: consume resultsCh data in exportHandler if writeResponseFunc failed to consume it 2019-05-24 14:54:54 +03:00
Aliaksandr Valialkin
209ad975ae deployment: update docker images 2019-05-24 13:16:20 +03:00
Aliaksandr Valialkin
9b64dfee4b lib/encoding: add vm_zstd_block_{compress|decompress}_calls_total for determining the number CompressZSTD / DecompressZSTD calls 2019-05-24 13:01:15 +03:00
Aliaksandr Valialkin
364f4ec3bb all: remove -p XXXX:XXXX from docker run options, since it is unnesessary if --net=host is set 2019-05-24 12:53:12 +03:00
Aliaksandr Valialkin
f37903adb3 app/vminsert: add -rpc.disableCompression command-line flag for reducing CPU usage at the cost of higher network bandwidth usage 2019-05-24 12:51:07 +03:00
Aliaksandr Valialkin
b23352dc9e lib/encoding: add vm_zstd_block_{original|compressed}_bytes_total metrics for rough estimation of block compression ratio 2019-05-24 12:34:51 +03:00
Aliaksandr Valialkin
f67f40d63a lib/encoding: substitute CompressZSTD with CompressZSTDLevel 2019-05-24 12:32:49 +03:00
Aliaksandr Valialkin
a26e774eca lib/httpserver: add -http.disableResponseCompression flag, which may help saving CPU resources at the cost of higher network bandwidth usage 2019-05-24 12:20:36 +03:00
Aliaksandr Valialkin
8e3eb5b39d app/vmselect/promql: add alias(q, name) function that sets the given name to all the time series in q 2019-05-24 02:42:10 +03:00
Aliaksandr Valialkin
820cdae88d lib/decimal: add a comment explaining weird code in maxUpExponent. Fixes #29 2019-05-23 17:18:50 +03:00
Aliaksandr Valialkin
bb048937bc app/vmselect/promql: add label_transform(q, label, regexp, replacement) function for replacing all the occurences of regexp with replacement in the given label for q 2019-05-23 16:26:07 +03:00
Aliaksandr Valialkin
54346de548 README.md: typo fix 2019-05-23 02:25:54 +03:00
Aliaksandr Valialkin
b98789ae9f README.md: mention that VictoriaMetrics is high-perf cost-effective TSDB 2019-05-23 00:41:08 +03:00
Aliaksandr Valialkin
24578b4bb1 all: open-sourcing cluster version 2019-05-23 00:25:38 +03:00
2014 changed files with 542279 additions and 99478 deletions

55
.github/workflows/main.yml vendored Normal file
View File

@@ -0,0 +1,55 @@
name: main
on:
- push
- pull_request
jobs:
build:
name: Build
runs-on: ubuntu-latest
steps:
- name: Setup Go
uses: actions/setup-go@main
with:
go-version: 1.15
id: go
- name: Dependencies
run: |
go get -u golang.org/x/lint/golint
go get -u github.com/kisielk/errcheck
curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(go env GOPATH)/bin v1.29.0
- name: Code checkout
uses: actions/checkout@master
- name: Build
env:
GO111MODULE: on
run: |
export PATH=$PATH:$(go env GOPATH)/bin # temporary fix. See https://github.com/actions/setup-go/issues/14
make check-all
git diff --exit-code
make test-full
make test-pure
make test-full-386
make vminsert vmselect vmstorage
make vminsert-pure vmselect-pure vmstorage-pure
make vmutils
GOOS=freebsd go build -mod=vendor ./app/vminsert
GOOS=freebsd go build -mod=vendor ./app/vmselect
GOOS=freebsd go build -mod=vendor ./app/vmstorage
GOOS=freebsd go build -mod=vendor ./app/vmagent
GOOS=freebsd go build -mod=vendor ./app/vmalert
GOOS=freebsd go build -mod=vendor ./app/vmbackup
GOOS=freebsd go build -mod=vendor ./app/vmrestore
GOOS=openbsd go build -mod=vendor ./app/vminsert
GOOS=openbsd go build -mod=vendor ./app/vmselect
GOOS=openbsd go build -mod=vendor ./app/vmstorage
GOOS=openbsd go build -mod=vendor ./app/vmagent
GOOS=openbsd go build -mod=vendor ./app/vmalert
GOOS=openbsd go build -mod=vendor ./app/vmbackup
GOOS=openbsd go build -mod=vendor ./app/vmrestore
GOOS=darwin go build -mod=vendor ./app/vminsert
GOOS=darwin go build -mod=vendor ./app/vmselect
GOOS=darwin go build -mod=vendor ./app/vmstorage
GOOS=darwin go build -mod=vendor ./app/vmagent
GOOS=darwin go build -mod=vendor ./app/vmalert
GOOS=darwin go build -mod=vendor ./app/vmbackup
GOOS=darwin go build -mod=vendor ./app/vmrestore

8
.gitignore vendored
View File

@@ -7,5 +7,13 @@
*.swp
/gocache-for-docker
/victoria-metrics-data
/vmagent-remotewrite-data
/vmstorage-data
/vmselect-cache
.DS_Store
### terraform
terraform.tfstate
terraform.tfstate.*
.terraform/

120
CODE_OF_CONDUCT_RU.md Normal file
View File

@@ -0,0 +1,120 @@
# Кодекс Поведения участника
## Наши обязательства
Мы, как участники, авторы и лидеры обязуемся сделать участие в сообществе
свободным от притеснений для всех, независимо от возраста, телосложения,
видимых или невидимых ограничений способности, этнической принадлежности,
половых признаков, гендерной идентичности и выражения, уровня опыта,
образования, социо-экономического статуса, национальности, внешности,
расы, религии, или сексуальной идентичности и ориентации.
Мы обещаем действовать и взаимодействовать таким образом, чтобы вносить вклад в открытое,
дружелюбное, многообразное, инклюзивное и здоровое сообщество.
## Наши стандарты
Примеры поведения, создающие условия для благоприятных взаимоотношений включают в себя:
* Проявление доброты и эмпатии к другим участникам проекта
* Уважение к чужой точке зрения и опыту
* Конструктивная критика и принятие конструктивной критики
* Принятие ответственности, принесение извинений тем, кто пострадал от наших ошибок
и извлечение уроков из опыта
* Ориентирование на то, что лучше подходит для сообщества, а не только для нас лично
Примеры неприемлемого поведения участников включают в себя:
* Использование выражений или изображений сексуального характера и нежелательное сексуальное внимание или домогательство в любой форме
* Троллинг, оскорбительные или уничижительные комментарии, переход на личности или затрагивание политических убеждений
* Публичное или приватное домогательство
* Публикация личной информации других лиц, например, физического или электронного адреса, без явного разрешения
* Иное поведение, которое обоснованно считать неуместным в профессиональной обстановке
## Обязанности
Лидеры сообщества отвечают за разъяснение и применение наших стандартов приемлемого
поведения и будут предпринимать соответствующие и честные меры по исправлению положения
в ответ на любое поведение, которое они сочтут неприемлемым, угрожающим, оскорбительным или вредным.
Лидеры сообщества обладают правом и обязанностью удалять, редактировать или отклонять
комментарии, коммиты, код, изменения в вики, вопросы и другой вклад, который не совпадает
с Кодексом Поведения, и предоставят причины принятого решения, когда сочтут нужным.
## Область применения
Данный Кодекс Поведения применим во всех публичных физических и цифровых пространствах сообщества,
а также когда человек официально представляет сообщество в публичных местах.
Примеры представления проекта или сообщества включают использование официальной электронной почты,
публикации в официальном аккаунте в социальных сетях,
или упоминания как представителя в онлайн или оффлайн мероприятии.
## Приведение в исполнение
О случаях домогательства, а также оскорбительного или иного неприемлемого
поведения можно сообщить ответственным лидерам сообщества с помощью письма на info@victoriametrics.com.
Все жалобы будут рассмотрены и расследованы оперативно и беспристрастно.
Все лидеры сообщества обязаны уважать неприкосновенность частной жизни и личную
неприкосновенность автора сообщения.
## Руководство по исполнению
Лидеры сообщества будут следовать следующим Принципам Воздействия в Сообществе,
чтобы определить последствия для тех, кого они считают виновными в нарушении данного Кодекса Поведения:
### 1. Исправление
**Общественное влияние**: Использование недопустимой лексики или другое поведение,
считающееся непрофессиональным или нежелательным в сообществе.
**Последствия**: Личное, письменное предупреждение от лидеров сообщества,
объясняющее суть нарушения и почему такое поведение
было неуместно. Лидеры сообщества могут попросить принести публичное извинение.
### 2. Предупреждение
**Общественное влияние**: Нарушение в результате одного инцидента или серии действий.
**Последствия**: Предупреждение о последствиях в случае продолжающегося неуместного поведения.
На определенное время не допускается взаимодействие с людьми, вовлеченными в инцидент,
включая незапрошенное взаимодействие
с теми, кто обеспечивает соблюдение Кодекса. Это включает в себя избегание взаимодействия
в публичных пространствах, а также во внешних каналах,
таких как социальные сети. Нарушение этих правил влечет за собой временный или вечный бан.
### 3. Временный бан
**Общественное влияние**: Серьёзное нарушение стандартов сообщества,
включая продолжительное неуместное поведение.
**Последствия**: Временный запрет (бан) на любое взаимодействие
или публичное общение с сообществом на определенный период времени.
На этот период не допускается публичное или личное взаимодействие с людьми,
вовлеченными в инцидент, включая незапрошенное взаимодействие
с теми, кто обеспечивает соблюдение Кодекса.
Нарушение этих правил влечет за собой вечный бан.
### 4. Вечный бан
**Общественное влияние**: Демонстрация систематических нарушений стандартов сообщества,
включая продолжающееся неуместное поведение, домогательство до отдельных лиц,
или проявление агрессии либо пренебрежительного отношения к категориям лиц.
**Последствия**: Вечный запрет на любое публичное взаимодействие с сообществом.
## Атрибуция
Данный Кодекс Поведения основан на [Кодексе Поведения участника][homepage],
версии 2.0, доступной по адресу
https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.
Принципы Воздействия в Сообществе были вдохновлены [Mozilla's code of conduct
enforcement ladder](https://github.com/mozilla/diversity).
[homepage]: https://www.contributor-covenant.org
Ответы на общие вопросы о данном кодексе поведения ищите на странице FAQ:
https://www.contributor-covenant.org/faq. Переводы доступны по адресу
https://www.contributor-covenant.org/translations.

View File

@@ -175,7 +175,7 @@
END OF TERMS AND CONDITIONS
Copyright 2019 VictoriaMetrics, Inc.
Copyright 2019-2020 VictoriaMetrics, Inc.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.

145
Makefile
View File

@@ -1,7 +1,7 @@
PKG_PREFIX := github.com/VictoriaMetrics/VictoriaMetrics
BUILDINFO_TAG ?= $(shell echo $$(git describe --long --all | tr '/' '-')$$( \
git diff-index --quiet HEAD -- || echo '-dirty-'$$(git diff-index -u HEAD | sha1sum | grep -oP '^.{8}')))
git diff-index --quiet HEAD -- || echo '-dirty-'$$(git diff-index -u HEAD | openssl sha1 | cut -c 10-17)))
PKG_TAG ?= $(shell git tag -l --points-at HEAD)
ifeq ($(PKG_TAG),)
@@ -11,7 +11,24 @@ endif
GO_BUILDINFO = -X '$(PKG_PREFIX)/lib/buildinfo.Version=$(APP_NAME)-$(shell date -u +'%Y%m%d-%H%M%S')-$(BUILDINFO_TAG)'
all: \
victoria-metrics-prod
vminsert \
vmselect \
vmstorage \
vmagent \
vmalert \
vmauth \
vmbackup \
vmrestore
all-pure: \
vminsert-pure \
vmselect-pure \
vmstorage-pure \
vmagent-pure \
vmalert-pure \
vmauth-pure \
vmbackup-pure \
vmrestore-pure
include app/*/Makefile
include deployment/*/Makefile
@@ -19,46 +36,142 @@ include deployment/*/Makefile
clean:
rm -rf bin/*
release: victoria-metrics-prod
cd bin && tar czf victoria-metrics-$(PKG_TAG).tar.gz victoria-metrics-prod
publish: \
publish-vminsert \
publish-vmselect \
publish-vmstorage \
publish-vmagent \
publish-vmalert \
publish-vmauth \
publish-vmbackup \
publish-vmrestore
package: \
package-vminsert \
package-vmselect \
package-vmstorage \
package-vmagent \
package-vmalert \
package-vmauth \
package-vmbackup \
package-vmrestore
vmutils: \
vmagent \
vmalert \
vmauth \
vmbackup \
vmrestore
release: \
release-vmcluster \
release-vmutils
release-vmcluster: \
vminsert-prod \
vmselect-prod \
vmstorage-prod
cd bin && tar czf victoria-metrics-$(PKG_TAG).tar.gz vminsert-prod vmselect-prod vmstorage-prod && \
sha256sum victoria-metrics-$(PKG_TAG).tar.gz > victoria-metrics-$(PKG_TAG)_checksums.txt
release-vmutils: \
vmagent-prod \
vmalert-prod \
vmauth-prod \
vmbackup-prod \
vmrestore-prod
cd bin && tar czf vmutils-$(PKG_TAG).tar.gz vmagent-prod vmalert-prod vmauth-prod vmbackup-prod vmrestore-prod && \
sha256sum vmutils-$(PKG_TAG).tar.gz > vmutils-$(PKG_TAG)_checksums.txt
pprof-cpu:
go tool pprof -trim_path=github.com/VictoriaMetrics/VictoriaMetrics@ $(PPROF_FILE)
fmt:
go fmt $(PKG_PREFIX)/lib/...
go fmt $(PKG_PREFIX)/app/...
GO111MODULE=on gofmt -l -w -s ./lib
GO111MODULE=on gofmt -l -w -s ./app
vet:
go vet $(PKG_PREFIX)/lib/...
go vet $(PKG_PREFIX)/app/...
GO111MODULE=on go vet -mod=vendor ./lib/...
GO111MODULE=on go vet -mod=vendor ./app/...
lint: install-golint
golint lib/...
golint app/...
install-golint:
which golint || GO111MODULE=off go get -u github.com/golang/lint/golint
which golint || go install golang.org/x/lint/golint
errcheck: install-errcheck
errcheck -exclude=errcheck_excludes.txt ./lib/...
errcheck -exclude=errcheck_excludes.txt ./app/vminsert/...
errcheck -exclude=errcheck_excludes.txt ./app/vmselect/...
errcheck -exclude=errcheck_excludes.txt ./app/vmstorage/...
errcheck -exclude=errcheck_excludes.txt ./app/vmagent/...
errcheck -exclude=errcheck_excludes.txt ./app/vmalert/...
errcheck -exclude=errcheck_excludes.txt ./app/vmauth/...
errcheck -exclude=errcheck_excludes.txt ./app/vmbackup/...
errcheck -exclude=errcheck_excludes.txt ./app/vmrestore/...
install-errcheck:
which errcheck || GO111MODULE=off go get -u github.com/kisielk/errcheck
which errcheck || go install github.com/kisielk/errcheck
check-all: fmt vet lint errcheck golangci-lint
test:
go test $(PKG_PREFIX)/lib/...
GO111MODULE=on go test -mod=vendor ./lib/... ./app/...
test-race:
GO111MODULE=on go test -mod=vendor -race ./lib/... ./app/...
test-pure:
GO111MODULE=on CGO_ENABLED=0 go test -mod=vendor ./lib/... ./app/...
test-full:
GO111MODULE=on go test -mod=vendor -coverprofile=coverage.txt -covermode=atomic ./lib/... ./app/...
test-full-386:
GO111MODULE=on GOARCH=386 go test -mod=vendor -coverprofile=coverage.txt -covermode=atomic ./lib/... ./app/...
benchmark:
go test -bench=. $(PKG_PREFIX)/lib/...
GO111MODULE=on go test -mod=vendor -bench=. ./lib/...
GO111MODULE=on go test -mod=vendor -bench=. ./app/...
benchmark-pure:
GO111MODULE=on CGO_ENABLED=0 go test -mod=vendor -bench=. ./lib/...
GO111MODULE=on CGO_ENABLED=0 go test -mod=vendor -bench=. ./app/...
vendor-update:
go get -u
go mod tidy
go mod vendor
GO111MODULE=on go get -u -d ./lib/...
GO111MODULE=on go get -u -d ./app/...
GO111MODULE=on go mod tidy
GO111MODULE=on go mod vendor
app-local:
CGO_ENABLED=1 GO111MODULE=on go build $(RACE) -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/$(APP_NAME)$(RACE) $(PKG_PREFIX)/app/$(APP_NAME)
app-local-pure:
CGO_ENABLED=0 GO111MODULE=on go build $(RACE) -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/$(APP_NAME)-pure$(RACE) $(PKG_PREFIX)/app/$(APP_NAME)
app-local-with-goarch:
GO111MODULE=on go build $(RACE) -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/$(APP_NAME)-$(GOARCH)$(RACE) $(PKG_PREFIX)/app/$(APP_NAME)
quicktemplate-gen: install-qtc
qtc
install-qtc:
which qtc || GO111MODULE=off go get -u github.com/valyala/quicktemplate/qtc
which qtc || go install github.com/valyala/quicktemplate/qtc
golangci-lint: install-golangci-lint
golangci-lint run --exclude '(SA4003|SA1019|SA5011):' -D errcheck -D structcheck --timeout 2m
install-golangci-lint:
which golangci-lint || curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(shell go env GOPATH)/bin v1.29.0
docs-sync:
cp app/vmagent/README.md docs/vmagent.md
cp app/vmalert/README.md docs/vmalert.md
cp app/vmauth/README.md docs/vmauth.md
cp app/vmbackup/README.md docs/vmbackup.md
cp app/vmrestore/README.md docs/vmrestore.md
cp README.md docs/Cluster-VictoriaMetrics.md

673
README.md
View File

@@ -1,386 +1,383 @@
<img text-align="center" alt="Victoria Metrics" src="logo.png">
# Cluster version
## Single-node VictoriaMetrics
<img alt="Victoria Metrics" src="logo.png">
[![Latest Release](https://img.shields.io/github/release/VictoriaMetrics/VictoriaMetrics.svg?style=flat-square)](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest)
VictoriaMetrics is a fast, cost-effective and scalable time series database. It can be used as a long-term remote storage for Prometheus.
VictoriaMetrics is a long-term remote storage for Prometheus.
It is available in [binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases),
[docker images](https://hub.docker.com/r/valyala/victoria-metrics/) and
in [source code](https://github.com/VictoriaMetrics/VictoriaMetrics).
It is recommended using [single-node version](https://github.com/VictoriaMetrics/VictoriaMetrics) instead of cluster version
for ingestion rates lower than a million data points per second.
Single-node version [scales perfectly](https://medium.com/@valyala/measuring-vertical-scalability-for-time-series-databases-in-google-cloud-92550d78d8ae)
with the number of CPU cores, RAM and available storage space.
Single-node version is easier to configure and operate compared to cluster version, so think twice before sticking to cluster version.
Cluster version is available [here](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/cluster).
Join [our Slack](http://slack.victoriametrics.com/) or [contact us](mailto:info@victoriametrics.com) with consulting and support questions.
## Prominent features
* Supports [Prometheus querying API](https://prometheus.io/docs/prometheus/latest/querying/api/), so it can be used as Prometheus drop-in replacement in Grafana.
Additionally, VictoriaMetrics extends PromQL with opt-in [useful features](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/ExtendedPromQL).
* High performance and good scalability for both [inserts](https://medium.com/@valyala/high-cardinality-tsdb-benchmarks-victoriametrics-vs-timescaledb-vs-influxdb-13e6ee64dd6b)
and [selects](https://medium.com/@valyala/when-size-matters-benchmarking-victoriametrics-vs-timescale-and-influxdb-6035811952d4).
[Outperforms InfluxDB and TimescaleDB by up to 20x](https://medium.com/@valyala/measuring-vertical-scalability-for-time-series-databases-in-google-cloud-92550d78d8ae).
* [Uses 10x less RAM than InfluxDB](https://medium.com/@valyala/insert-benchmarks-with-inch-influxdb-vs-victoriametrics-e31a41ae2893) when working with millions of unique time series (aka high cardinality).
* High data compression, so [up to 70x more data points](https://medium.com/@valyala/when-size-matters-benchmarking-victoriametrics-vs-timescale-and-influxdb-6035811952d4)
may be crammed into a limited storage comparing to TimescaleDB.
* Optimized for storage with high-latency IO and low iops (HDD and network storage in AWS, Google Cloud, Microsoft Azure, etc). See [graphs from these benchmarks](https://medium.com/@valyala/high-cardinality-tsdb-benchmarks-victoriametrics-vs-timescaledb-vs-influxdb-13e6ee64dd6b).
* A single-node VictoriaMetrics may substitute moderately sized clusters built with competing solutions such as Thanos, Uber M3, Cortex, InfluxDB or TimescaleDB.
See [vertical scalability benchmarks](https://medium.com/@valyala/measuring-vertical-scalability-for-time-series-databases-in-google-cloud-92550d78d8ae).
* Easy operation:
* VictoriaMetrics consists of a single executable without external dependencies.
* All the configuration is done via explicit command-line flags with reasonable defaults.
* All the data is stored in a single directory pointed by `-storageDataPath` flag.
* Easy backups from [instant snapshots](https://medium.com/@valyala/how-victoriametrics-makes-instant-snapshots-for-multi-terabyte-time-series-data-e1f3fb0e0282).
* Storage is protected from corruption on unclean shutdown (i.e. hardware reset or `kill -9`) thanks to [the storage architecture](https://medium.com/@valyala/how-victoriametrics-makes-instant-snapshots-for-multi-terabyte-time-series-data-e1f3fb0e0282).
* Supports metrics' ingestion and backfilling via the following protocols:
* [Prometheus remote write API](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write)
* [InfluxDB line protocol](https://docs.influxdata.com/influxdb/v1.7/write_protocols/line_protocol_tutorial/)
* [Graphite plaintext protocol](https://graphite.readthedocs.io/en/latest/feeding-carbon.html) with [tags](https://graphite.readthedocs.io/en/latest/tags.html#carbon)
if `-graphiteListenAddr` is set.
* [OpenTSDB put message](http://opentsdb.net/docs/build/html/api_telnet/put.html) if `-opentsdbListenAddr` is set.
* Ideally works with big amounts of time series data from IoT sensors, connected car sensors and industrial sensors.
* Has open source [cluster version](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/cluster).
- Supports all the features of [single-node version](https://github.com/VictoriaMetrics/VictoriaMetrics).
- Performance and capacity scales horizontally. See [these docs for details](#cluster-resizing-and-scalability).
- Supports multiple independent namespaces for time series data (aka multi-tenancy). See [these docs for details](#multitenancy).
- Supports replication. See [these docs for details](#replication-and-data-safety).
## Operation
## Architecture overview
VictoriaMetrics cluster consists of the following services:
- `vmstorage` - stores the data
- `vminsert` - proxies the ingested data to `vmstorage` shards using consistent hashing
- `vmselect` - performs incoming queries using the data from `vmstorage`
Each service may scale independently and may run on the most suitable hardware.
`vmstorage` nodes don't know about each other, don't communicate with each other and don't share any data.
This is [shared nothing architecture](https://en.wikipedia.org/wiki/Shared-nothing_architecture).
It increases cluster availability, simplifies cluster maintenance and cluster scaling.
<img src="https://docs.google.com/drawings/d/e/2PACX-1vTvk2raU9kFgZ84oF-OKolrGwHaePhHRsZEcfQ1I_EC5AB_XPWwB392XshxPramLJ8E4bqptTnFn5LL/pub?w=1104&amp;h=746">
### Table of contents
## Multitenancy
* [How to build from sources](#how-to-build-from-sources)
* [How to start VictoriaMetrics](#how-to-start-victoriametrics)
* [Prometheus setup](#prometheus-setup)
* [Grafana setup](#grafana-setup)
* [How to send data from InfluxDB-compatible agents such as Telegraf](#how-to-send-data-from-influxdb-compatible-agents-such-as-telegraf)
* [How to send data from Graphite-compatible agents such as StatsD](#how-to-send-data-from-graphite-compatible-agents-such-as-statsd)
* [How to send data from OpenTSDB-compatible agents](#how-to-send-data-from-opentsdb-compatible-agents)
* [How to apply new config / upgrade VictoriaMetrics](#how-to-apply-new-config--upgrade-victoriametrics)
* [How to work with snapshots](#how-to-work-with-snapshots)
* [How to delete time series](#how-to-delete-time-series)
* [How to export time series](#how-to-export-time-series)
* [Federation](#federation)
* [Capacity planning](#capacity-planning)
* [High Availability](#high-availability)
* [Multiple retentions](#multiple-retentions)
* [Scalability and cluster version](#scalability-and-cluster-version)
* [Security](#security)
* [Tuning](#tuning)
* [Monitoring](#monitoring)
* [Troubleshooting](#troubleshooting)
* [Community and contributions](#community-and-contributions)
* [Reporting bugs](#reporting-bugs)
VictoriaMetrics cluster supports multiple isolated tenants (aka namespaces).
Tenants are identified by `accountID` or `accountID:projectID`, which are put inside request urls.
See [these docs](#url-format) for details. Some facts about tenants in VictoriaMetrics:
* Each `accountID` and `projectID` is identified by an arbitrary 32-bit integer in the range `[0 .. 2^32)`.
If `projectID` is missing, then it is automatically assigned to `0`. It is expected that other information about tenants
such as auth tokens, tenant names, limits, accounting, etc. is stored in a separate relational database. This database must be managed
by a separate service sitting in front of VictoriaMetrics cluster such as [vmauth](https://victoriametrics.github.io/vmauth.html).
[Contact us](mailto:info@victoriametrics.com) if you need help with creating such a service.
* Tenants are automatically created when the first data point is written into the given tenant.
* Data for all the tenants is evenly spread among available `vmstorage` nodes. This guarantees even load among `vmstorage` nodes
when different tenants have different amounts of data and different query load.
* VictoriaMetrics doesn't support querying multiple tenants in a single request.
### How to build from sources
## Binaries
We recommend using either [binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) or
[docker images](https://hub.docker.com/r/valyala/victoria-metrics/) instead of building VictoriaMetrics
from sources. Building from sources is reasonable when developing additional features specific
to your needs.
Compiled binaries for cluster version are available in the `assets` section of [releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases).
See archives containing `cluster` word.
Docker images for cluster version are available here:
- `vminsert` - https://hub.docker.com/r/victoriametrics/vminsert/tags
- `vmselect` - https://hub.docker.com/r/victoriametrics/vmselect/tags
- `vmstorage` - https://hub.docker.com/r/victoriametrics/vmstorage/tags
#### Development build
## Building from sources
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.12.
2. Run `go build ./app/victoria-metrics` from the root folder of the repository.
It will build `victoria-metrics` binary in the root folder of the repository.
Source code for cluster version is available at [cluster branch](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/cluster).
#### Production build
1. [Install docker](https://docs.docker.com/install/).
2. Run `make victoria-metrics-prod` from the root folder of the repository.
It will build `victoria-metrics-prod` binary and put it into the `bin` folder.
### Production builds
#### Building docker images
There is no need in installing Go on a host system since binaries are built
inside [the official docker container for Go](https://hub.docker.com/_/golang).
This makes reproducible builds.
So [install docker](https://docs.docker.com/install/) and run the following command:
```
make vminsert-prod vmselect-prod vmstorage-prod
```
Production binaries are built into statically linked binaries. They are put into `bin` folder with `-prod` suffixes:
```
$ make vminsert-prod vmselect-prod vmstorage-prod
$ ls -1 bin
vminsert-prod
vmselect-prod
vmstorage-prod
```
### Development Builds
1. [Install go](https://golang.org/doc/install). The minimum supported version is Go 1.13.
2. Run `make` from the repository root. It should build `vmstorage`, `vmselect`
and `vminsert` binaries and put them into the `bin` folder.
### Building docker images
Run `make package`. It will build the following docker images locally:
* `victoriametrics/vminsert:<PKG_TAG>`
* `victoriametrics/vmselect:<PKG_TAG>`
* `victoriametrics/vmstorage:<PKG_TAG>`
Run `make package-victoria-metrics`. It will build `valyala/victoria-metrics:<PKG_TAG>` docker image locally.
`<PKG_TAG>` is auto-generated image tag, which depends on source code in the repository.
The `<PKG_TAG>` may be manually set via `PKG_TAG=foobar make package`.
By default images are built on top of [alpine](https://hub.docker.com/_/alpine) image in order to improve debuggability.
It is possible to build an image on top of any other base image by setting it via `<ROOT_IMAGE>` environment variable.
For example, the following command builds images on top of [scratch](https://hub.docker.com/_/scratch) image:
### How to start VictoriaMetrics
Just start VictoriaMetrics executable or docker image with the desired command-line flags.
The following command line flags are used the most:
* `-storageDataPath` - path to data directory. VictoriaMetrics stores all the data in this directory.
* `-retentionPeriod` - retention period in months for the data. Older data is automatically deleted.
* `-httpListenAddr` - TCP address to listen to for http requests. By default it listens port `8428` on all the network interfaces.
* `-graphiteListenAddr` - TCP and UDP address to listen to for Graphite data. By default it is disabled.
* `-opentsdbListenAddr` - TCP and UDP address to listen to for OpenTSDB data. By default it is disabled.
Pass `-help` to see all the available flags with description and default values.
### Prometheus setup
Add the following lines to Prometheus config file (it is usually located at `/etc/prometheus/prometheus.yml`):
```yml
remote_write:
- url: http://<victoriametrics-addr>:8428/api/v1/write
queue_config:
max_samples_per_send: 10000
```bash
ROOT_IMAGE=scratch make package
```
Substitute `<victoriametrics-addr>` with the hostname or IP address of VictoriaMetrics.
Then apply the new config via the following command:
## Operation
```
kill -HUP `pidof prometheus`
```
### Cluster setup
Prometheus writes incoming data to local storage and to remote storage in parallel.
This means the data remains available in local storage for `--storage.tsdb.retention.time` duration
if remote storage stops working.
A minimal cluster must contain the following nodes:
If you plan sending data to VictoriaMetrics from multiple Prometheus instances, then add the following lines into `global` section
of [Prometheus config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#configuration-file):
* a single `vmstorage` node with `-retentionPeriod` and `-storageDataPath` flags
* a single `vminsert` node with `-storageNode=<vmstorage_host>:8400`
* a single `vmselect` node with `-storageNode=<vmstorage_host>:8401`
```yml
global:
external_labels:
datacenter: dc-123
```
It is recommended to run at least two nodes for each service
for high availability purposes.
This instructs Prometheus to add `datacenter=dc-123` label to each time series sent to remote storage.
The label name may be arbitrary - `datacenter` is just an example. The label value must be unique
across Prometheus instances, so time series may be filtered and grouped by this label.
An http load balancer such as `nginx` must be put in front of `vminsert` and `vmselect` nodes:
- requests starting with `/insert` must be routed to port `8480` on `vminsert` nodes.
- requests starting with `/select` must be routed to port `8481` on `vmselect` nodes.
Ports may be altered by setting `-httpListenAddr` on the corresponding nodes.
### Grafana setup
It is recommended setting up [monitoring](#monitoring) for the cluster.
Create [Prometheus datasource](http://docs.grafana.org/features/datasources/prometheus/) in Grafana with the following Url:
#### Environment variables
```
http://<victoriametrics-addr>:8428
```
Each flag value can be set through environment variables by following these rules:
Substitute `<victoriametrics-addr>` with the hostname or IP address of VictoriaMetrics.
Then build graphs with the created datasource using [Prometheus query language](https://prometheus.io/docs/prometheus/latest/querying/basics/).
VictoriaMetrics supports native PromQL and [extends it with useful features](ExtendedPromQL).
### How to send data from InfluxDB-compatible agents such as [Telegraf](https://www.influxdata.com/time-series-platform/telegraf/)?
Just use `http://<victoriametrics-addr>:8428` url instead of InfluxDB url in agents' configs.
For instance, put the following lines into `Telegraf` config, so it sends data to VictoriaMetrics instead of InfluxDB:
```
[[outputs.influxdb]]
urls = ["http://<victoriametrics-addr>:8428"]
```
Do not forget substituting `<victoriametrics-addr>` with the real address where VictoriaMetrics runs.
VictoriaMetrics maps Influx data using the following rules:
* [`db` query arg](https://docs.influxdata.com/influxdb/v1.7/tools/api/#write-http-endpoint) is mapped into `db` label value
* Field names are mapped to time series names prefixed by `{measurement}.` value
* Field values are mapped to time series values
* Tags are mapped to Prometheus labels as-is
### How to send data from Graphite-compatible agents such as [StatsD](https://github.com/etsy/statsd)?
1) Enable Graphite receiver in VictoriaMetrics by setting `-graphiteListenAddr` command line flag. For instance,
the following command will enable Graphite receiver in VictoriaMetrics on TCP and UDP port `2003`:
```
/path/to/victoria-metrics-prod ... -graphiteListenAddr=:2003
```
2) Use the configured address in Graphite-compatible agents. For instance, set `graphiteHost`
to the VictoriaMetrics host in `StatsD` configs.
### How to send data from OpenTSDB-compatible agents?
1) Enable OpenTSDB receiver in VictoriaMetrics by setting `-opentsdbListenAddr` command line flag. For instance,
the following command will enable OpenTSDB receiver in VictoriaMetrics on TCP and UDP port `4242`:
```
/path/to/victoria-metrics-prod ... -opentsdbListenAddr=:4242
```
2) Send data to the given address from OpenTSDB-compatible agents.
### How to apply new config / upgrade VictoriaMetrics?
VictoriaMetrics must be restarted in order to upgrade or apply new config:
1) Send `SIGINT` signal to VictoriaMetrics process in order to gracefully stop it.
2) Wait until the process stops. This can take a few seconds.
3) Start the upgraded VictoriaMetrics with new config.
### How to work with snapshots?
Navigate to `http://<victoriametrics-addr>:8428/snapshot/create` in order to create an instant snapshot.
The page will return the following JSON response:
```
{"status":"ok","snapshot":"<snapshot-name>"}
```
Snapshots are created under `<-storageDataPath>/snapshots` directory, where `<-storageDataPath>`
is the command-line flag value. Snapshots can be archived to backup storage via `rsync -L`, `scp -r`
or any similar tool that follows symlinks during copying.
The `http://<victoriametrics-addr>:8428/snapshot/list` page contains the list of available snapshots.
Navigate to `http://<victoriametrics-addr>:8428/snapshot/delete?snapshot=<snapshot-name>` in order
to delete `<snapshot-name>` snapshot.
Navigate to `http://<victoriametrics-addr>:8428/snapshot/delete_all` in order to delete all the snapshots.
### How to delete time series?
Send a request to `http://<victoriametrics-addr>:8428/api/v1/admin/tsdb/delete_series?match[]=<timeseries_selector_for_delete>`,
where `<timeseries_selector_for_delete>` may contain any [time series selector](https://prometheus.io/docs/prometheus/latest/querying/basics/#time-series-selectors)
for metrics to delete. After that all the time series matching the given selector are deleted. Storage space for
the deleted time series isn't freed instantly - it is freed during subsequent merges of data files.
### How to export time series?
Send a request to `http://<victoriametrics-addr>:8428/api/v1/export?match[]=<timeseries_selector_for_export>`,
where `<timeseries_selector_for_export>` may contain any [time series selector](https://prometheus.io/docs/prometheus/latest/querying/basics/#time-series-selectors)
for metrics to export. The response would contain all the data for the selected time series in [JSON streaming format](https://en.wikipedia.org/wiki/JSON_streaming#Line-delimited_JSON).
Each JSON line would contain data for a single time series. An example output:
```
{"metric":{"__name__":"up","job":"node_exporter","instance":"localhost:9100"},"values":[0,0,0],"timestamps":[1549891472010,1549891487724,1549891503438]}
{"metric":{"__name__":"up","job":"prometheus","instance":"localhost:9090"},"values":[1,1,1],"timestamps":[1549891461511,1549891476511,1549891491511]}
```
Optional `start` and `end` args may be added to the request in order to limit the time frame for the exported data. These args may contain either
unix timestamp in seconds or [RFC3339](https://www.ietf.org/rfc/rfc3339.txt) values.
### Federation
VictoriaMetrics exports [Prometheus-compatible federation data](https://prometheus.io/docs/prometheus/latest/federation/)
at `http://<victoriametrics-addr>:8428/federate?match[]=<timeseries_selector_for_federation>`.
Optional `start` and `end` args may be added to the request in order to scrape the last point for each selected time series on the `[start ... end]` interval.
`start` and `end` may contain either unix timestamp in seconds or [RFC3339](https://www.ietf.org/rfc/rfc3339.txt) values. By default the last point
on the interval `[now - max_lookback ... now]` is scraped for each time series. Default value for `max_lookback` is `5m` (5 minutes), but can be overridden.
For instance, `/federate?match[]=up&max_lookback=1h` would return last points on the `[now - 1h ... now]` interval. This may be useful for time series federation
with scrape intervals exceeding `5m`.
### Capacity planning
Rough estimation of the required resources:
* RAM size: less than 1KB per active time series. So, ~1GB of RAM is required for 1M active time series.
Time series is considered active if new data points have been added to it recently or if it has been recently queried.
VictoriaMetrics stores various caches in RAM. Memory size for these caches may be limited with `-memory.allowedPercent` flag.
* CPU cores: a CPU core per 300K inserted data points per second. So, ~4 CPU cores are required for processing
the insert stream of 1M data points per second.
If you see lower numbers per CPU core, then it is likely active time series info doesn't fit caches,
so you need more RAM for lowering CPU usage.
* Storage size: less than a byte per data point on average. So, ~260GB is required for storing a month-long insert stream
of 100K data points per second.
The actual storage size heavily depends on data randomness (entropy). Higher randomness means higher storage size requirements.
### High availability
1) Install multiple VictoriaMetrics instances in distinct datacenters.
2) Add addresses of these instances to `remote_write` section in Prometheus config:
```yml
remote_write:
- url: http://<victoriametrics-addr-1>:8428/api/v1/write
queue_config:
max_samples_per_send: 10000
# ...
- url: http://<victoriametrics-addr-N>:8428/api/v1/write
queue_config:
max_samples_per_send: 10000
```
3) Apply the updated config:
```
kill -HUP `pidof prometheus`
```
4) Now Prometheus should write data into all the configured `remote_write` urls in parallel.
5) Set up [Promxy](https://github.com/jacksontj/promxy) in front of all the VictoriaMetrics replicas.
6) Set up Prometheus datasource in Grafana that points to Promxy.
### Multiple retentions
Just start multiple VictoriaMetrics instances with distinct values for the following flags:
* `-retentionPeriod`
* `-storageDataPath`, so the data for each retention period is saved in a separate directory
* `-httpListenAddr`, so clients may reach VictoriaMetrics instance with proper retention
### Scalability and cluster version
Though single-node VictoriaMetrics cannot scale to multiple nodes, it is optimized for resource usage - storage size / bandwidth / IOPS, RAM, CPU.
This means that a single-node VictoriaMetrics may scale vertically and substitute moderately sized cluster built with competing solutions
such as Thanos, Uber M3, InfluxDB or TimescaleDB.
So try single-node VictoriaMetrics at first and then [switch to cluster version](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/cluster) if you still need
horizontally scalable long-term remote storage for really large Prometheus deployments.
[Contact us](mailto:info@victoriametrics.com) for paid support.
### Security
Do not forget protecting sensitive endpoints in VictoriaMetrics when exposing it to untrusted networks such as internet.
Consider setting the following command-line flags:
* `-tls`, `-tlsCertFile` and `-tlsKeyFile` for switching from HTTP to HTTPS.
* `-httpAuth.username` and `-httpAuth.password` for protecting all the HTTP endpoints
with [HTTP Basic Authentication](https://en.wikipedia.org/wiki/Basic_access_authentication).
* `-deleteAuthKey` for protecting `/api/v1/admin/tsdb/delete_series` endpoint. See [how to delete time series](#how-to-delete-time-series).
* `-snapshotAuthKey` for protecting `/snapshot*` endpoints. See [how to work with snapshots](#how-to-work-with-snapshots).
Explicitly set internal network interface for TCP and UDP ports for data ingestion with Graphite and OpenTSDB formats.
For example, substitute `-graphiteListenAddr=:2003` with `-graphiteListenAddr=<internal_iface_ip>:2003`.
### Tuning
* There is no need in VictoriaMetrics tuning, since it uses reasonable defaults for command-line flags,
which are automatically adjusted for the available CPU and RAM resources.
* There is no need in Operating System tuning, since VictoriaMetrics is optimized for default OS settings.
The only option is increasing the limit on [the number of open files in the OS](https://medium.com/@muhammadtriwibowo/set-permanently-ulimit-n-open-files-in-ubuntu-4d61064429a),
so Prometheus instances could establish more connections to VictoriaMetrics.
- The `-envflag.enable` flag must be set
- Each `.` in flag names must be substituted by `_` (for example `-insert.maxQueueDuration <duration>` will translate to `insert_maxQueueDuration=<duration>`)
- For repeating flags, an alternative syntax can be used by joining the different values into one using `,` as separator (for example `-storageNode <nodeA> -storageNode <nodeB>` will translate to `storageNode=<nodeA>,<nodeB>`)
- It is possible setting prefix for environment vars with `-envflag.prefix`. For instance, if `-envflag.prefix=VM_`, then env vars must be prepended with `VM_`
### Monitoring
VictoriaMetrics exports internal metrics in Prometheus format on the `/metrics` page.
Add this page to Prometheus' scrape config in order to collect VictoriaMetrics metrics.
There is [an official Grafana dashboard for single-node VictoriaMetrics](https://grafana.com/dashboards/10229).
All the cluster components expose various metrics in Prometheus-compatible format at `/metrics` page on the TCP port set in `-httpListenAddr` command-line flag.
By default the following TCP ports are used:
- `vminsert` - 8480
- `vmselect` - 8481
- `vmstorage` - 8482
It is recommended setting up [vmagent](https://victoriametrics.github.io/vmagent.html)
or Prometheus to scrape `/metrics` pages from all the cluster components, so they can be monitored and analyzed
with [the official Grafana dashboard for VictoriaMetrics cluster](https://grafana.com/grafana/dashboards/11176)
or [an alternative dashboard for VictoriaMetrics cluster](https://grafana.com/grafana/dashboards/11831).
### Troubleshooting
### URL format
* If VictoriaMetrics works slowly and eats more than a CPU core per 100K ingested data points per second,
then it is likely you have too many active time series for the current amount of RAM.
It is recommended increasing the amount of RAM on the node with VictoriaMetrics in order to improve
ingestion performance.
Another option is to increase `-memory.allowedPercent` command-line flag value. Be careful with this
option, since too big value for `-memory.allowedPercent` may result in high I/O usage.
* URLs for data ingestion: `http://<vminsert>:8480/insert/<accountID>/<suffix>`, where:
- `<accountID>` is an arbitrary 32-bit integer identifying namespace for data ingestion (aka tenant). It is possible to set it as `accountID:projectID`,
where `projectID` is also arbitrary 32-bit integer. If `projectID` isn't set, then it equals to `0`.
- `<suffix>` may have the following values:
- `prometheus` and `prometheus/api/v1/write` - for inserting data with [Prometheus remote write API](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write)
- `influx/write` and `influx/api/v2/write` - for inserting data with [Influx line protocol](https://docs.influxdata.com/influxdb/v1.7/write_protocols/line_protocol_tutorial/).
- `opentsdb/api/put` - for accepting [OpenTSDB HTTP /api/put requests](http://opentsdb.net/docs/build/html/api_http/put.html).
This handler is disabled by default. It is exposed on a distinct TCP address set via `-opentsdbHTTPListenAddr` command-line flag.
See [these docs](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#sending-opentsdb-data-via-http-apiput-requests) for details.
- `prometheus/api/v1/import` - for importing data obtained via `api/v1/export` on `vmselect` (see below).
- `prometheus/api/v1/import/native` - for importing data obtained via `api/v1/export/native` on `vmselect` (see below).
- `prometheus/api/v1/import/csv` - for importing arbitrary CSV data. See [these docs](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-import-csv-data) for details.
- `prometheus/api/v1/import/prometheus` - for importing data in Prometheus exposition format. See [these docs](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-import-data-in-prometheus-exposition-format) for details.
* URLs for [Prometheus querying API](https://prometheus.io/docs/prometheus/latest/querying/api/): `http://<vmselect>:8481/select/<accountID>/prometheus/<suffix>`, where:
- `<accountID>` is an arbitrary number identifying data namespace for the query (aka tenant)
- `<suffix>` may have the following values:
- `api/v1/query` - performs [PromQL instant query](https://prometheus.io/docs/prometheus/latest/querying/api/#instant-queries).
- `api/v1/query_range` - performs [PromQL range query](https://prometheus.io/docs/prometheus/latest/querying/api/#range-queries).
- `api/v1/series` - performs [series query](https://prometheus.io/docs/prometheus/latest/querying/api/#finding-series-by-label-matchers).
- `api/v1/labels` - returns a [list of label names](https://prometheus.io/docs/prometheus/latest/querying/api/#getting-label-names).
- `api/v1/label/<label_name>/values` - returns values for the given `<label_name>` according [to API](https://prometheus.io/docs/prometheus/latest/querying/api/#querying-label-values).
- `federate` - returns [federated metrics](https://prometheus.io/docs/prometheus/latest/federation/).
- `api/v1/export` - exports raw data in JSON line format. See [this article](https://medium.com/@valyala/analyzing-prometheus-data-with-external-tools-5f3e5e147639) for details.
- `api/v1/export/native` - exports raw data in native binary format. It may be imported into another VictoriaMetrics via `api/v1/import/native` (see above).
- `api/v1/export/csv` - exports data in CSV. It may be imported into another VictoriaMetrics via `api/v1/import/csv` (see above).
- `api/v1/status/tsdb` - for time series stats. See [these docs](https://prometheus.io/docs/prometheus/latest/querying/api/#tsdb-stats) for details.
VictoriaMetrics accepts optional `topN=N` and `date=YYYY-MM-DD` query args for this handler, where `N` is the number of top entries to return in the response
and `YYYY-MM-DD` is the date for collecting the stats. By default the stats are collected for the current day.
- `api/v1/status/active_queries` - for currently executed active queries. Note that every `vmselect` maintains an independent list of active queries,
which is returned in the response.
* URLs for [Graphite Metrics API](https://graphite-api.readthedocs.io/en/latest/api.html#the-metrics-api): `http://<vmselect>:8481/select/<accountID>/graphite/<suffix>`, where:
- `<accountID>` is an arbitrary number identifying data namespace for query (aka tenant)
- `<suffix>` may have the following values:
- `metrics/find` - searches Graphite metrics. See [these docs](https://graphite-api.readthedocs.io/en/latest/api.html#metrics-find).
- `metrics/expand` - expands Graphite metrics. See [these docs](https://graphite-api.readthedocs.io/en/latest/api.html#metrics-expand).
- `metrics/index.json` - returns all the metric names. See [these docs](https://graphite-api.readthedocs.io/en/latest/api.html#metrics-index-json).
- `tags/tagSeries` - registers time series. See [these docs](https://graphite.readthedocs.io/en/stable/tags.html#adding-series-to-the-tagdb).
- `tags/tagMultiSeries` - register multiple time series. See [these docs](https://graphite.readthedocs.io/en/stable/tags.html#adding-series-to-the-tagdb).
- `tags` - returns tag names. See [these docs](https://graphite.readthedocs.io/en/stable/tags.html#exploring-tags).
- `tags/<tag_name>` - returns tag values for the given `<tag_name>`. See [these docs](https://graphite.readthedocs.io/en/stable/tags.html#exploring-tags).
- `tags/findSeries` - returns series matching the given `expr`. See [these docs](https://graphite.readthedocs.io/en/stable/tags.html#exploring-tags).
- `tags/autoComplete/tags` - returns tags matching the given `tagPrefix` and/or `expr`. See [these docs](https://graphite.readthedocs.io/en/stable/tags.html#auto-complete-support).
- `tags/autoComplete/values` - returns tag values matching the given `valuePrefix` and/or `expr`. See [these docs](https://graphite.readthedocs.io/en/stable/tags.html#auto-complete-support).
- `tags/delSeries` - deletes series matching the given `path`. See [these docs](https://graphite.readthedocs.io/en/stable/tags.html#removing-series-from-the-tagdb).
* URL for time series deletion: `http://<vmselect>:8481/delete/<accountID>/prometheus/api/v1/admin/tsdb/delete_series?match[]=<timeseries_selector_for_delete>`.
Note that the `delete_series` handler should be used only in exceptional cases such as deletion of accidentally ingested incorrect time series. It shouldn't
be used on a regular basis, since it carries non-zero overhead.
* `vmstorage` nodes provide the following HTTP endpoints on `8482` port:
- `/internal/force_merge` - initiate [forced compactions](https://victoriametrics.github.io/#forced-merge) on the given `vmstorage` node.
- `/snapshot/create` - create [instant snapshot](https://medium.com/@valyala/how-victoriametrics-makes-instant-snapshots-for-multi-terabyte-time-series-data-e1f3fb0e0282),
which can be used for backups in background. Snapshots are created in `<storageDataPath>/snapshots` folder, where `<storageDataPath>` is the corresponding
command-line flag value.
- `/snapshot/list` - list available snapshots.
- `/snapshot/delete?snapshot=<id>` - delete the given snapshot.
- `/snapshot/delete_all` - delete all the snapshots.
Snapshots may be created independently on each `vmstorage` node. There is no need in synchronizing snapshots' creation
across `vmstorage` nodes.
### Cluster resizing and scalability
Cluster performance and capacity scales with adding new nodes.
* `vminsert` and `vmselect` nodes are stateless and may be added / removed at any time.
Do not forget updating the list of these nodes on http load balancer.
Adding more `vminsert` nodes scales data ingestion rate. See [this comment](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/175#issuecomment-536925841)
about ingestion rate scalability.
Adding more `vmselect` nodes scales select queries rate.
* `vmstorage` nodes own the ingested data, so they cannot be removed without data loss.
Adding more `vmstorage` nodes scales cluster capacity.
Steps to add `vmstorage` node:
1. Start new `vmstorage` node with the same `-retentionPeriod` as existing nodes in the cluster.
2. Gradually restart all the `vmselect` nodes with new `-storageNode` arg containing `<new_vmstorage_host>:8401`.
3. Gradually restart all the `vminsert` nodes with new `-storageNode` arg containing `<new_vmstorage_host>:8400`.
### Updating / reconfiguring cluster nodes
All the node types - `vminsert`, `vmselect` and `vmstorage` - may be updated via graceful shutdown.
Send `SIGINT` signal to the corresponding process, wait until it finishes and then start new version
with new configs.
Cluster should remain in working state if at least a single node of each type remains available during
the update process. See [cluster availability](#cluster-availability) section for details.
### Cluster availability
* HTTP load balancer must stop routing requests to unavailable `vminsert` and `vmselect` nodes.
* The cluster remains available if at least a single `vmstorage` node exists:
- `vminsert` re-routes incoming data from unavailable `vmstorage` nodes to healthy `vmstorage` nodes
- `vmselect` continues serving partial responses if at least a single `vmstorage` node is available.
Data replication can be used for increasing storage durability. See [these docs](#replication-and-data-safety) for details.
### Capacity planning
Each instance type - `vminsert`, `vmselect` and `vmstorage` - can run on the most suitable hardware.
#### vminsert
* The recommended total number of vCPU cores for all the `vminsert` instances can be calculated from the ingestion rate: `vCPUs = ingestion_rate / 150K`.
* The recommended number of vCPU cores per each `vminsert` instance should be equal to the number of `vmstorage` instances in the cluster.
* The amount of RAM per each `vminsert` instance should be 1GB or more. RAM is used as a buffer for spikes in ingestion rate.
The maximum amount of used RAM per `vminsert` node can be tuned with `-memory.allowedPercent` or `-memory.allowedBytes` command-line flags.
For instance, `-memory.allowedPercent=20` limits the maximum amount of used RAM to 20% of the available RAM on the host system.
* Sometimes `-rpc.disableCompression` command-line flag on `vminsert` instances could increase ingestion capacity at the cost
of higher network bandwidth usage between `vminsert` and `vmstorage`.
#### vmstorage
* The recommended total number of vCPU cores for all the `vmstorage` instances can be calculated from the ingestion rate: `vCPUs = ingestion_rate / 150K`.
* The recommended total amount of RAM for all the `vmstorage` instances can be calculated from the number of active time series: `RAM = 2 * active_time_series * 1KB`.
Time series is active if it received at least a single data point during the last hour or if it has been queried during the last hour.
The required RAM per each `vmstorage` should be multiplied by `-replicationFactor` if [replication](#replication-and-data-safety) is enabled.
Additional RAM can be required for query processing.
Calculated RAM requirements may differ from actual RAM requirements due to various factors:
* The average number of labels per time series. More labels require more RAM.
* The average length of label names and label values. Longer labels require more RAM.
* The type of queries. Heavy queries that scan big number of time series over long time ranges require more RAM.
* The recommended total amount of storage space for all the `vmstorage` instances can be calculated
from the ingestion rate and retention: `storage_space = ingestion_rate * retention_seconds`.
#### vmselect
The recommended hardware for `vmselect` instances highly depends on the type of queries. Lightweight queries over small number of time series usually require
small number of vCPU cores and small amount of RAM on `vmselect`, while heavy queries over big number of time series (>10K) usually require
bigger number of vCPU cores and bigger amounts of RAM.
In general it is recommended increasing the number of vCPU cores and RAM per `vmselect` node for higher query performance,
while adding new `vmselect` nodes only when old nodes are overloaded with incoming query stream.
### High availability
It is recommended to run all the components for a single cluster in the same subnetwork with high bandwidth, low latency and low error rates.
This improves cluster performance and availability.
It isn't recommended spreading components for a single cluster across multiple availability zones, since cross-AZ network usually has lower bandwidth, higher latency
and higher error rates compared to the network inside AZ.
If you need multi-AZ setup, then it is recommended running independent clusters in each AZ and setting up
[vmagent](https://victoriametrics.github.io/vmagent.html) in front of these clusters, so it could replicate incoming data
into all the clusters. Then [promxy](https://github.com/jacksontj/promxy) could be used for querying the data from multiple clusters.
### Helm
Helm chart simplifies managing cluster version of VictoriaMetrics in Kubernetes.
It is available in the [helm-charts](https://github.com/VictoriaMetrics/helm-charts) repository.
### Kubernetes operator
[K8s operator](https://github.com/VictoriaMetrics/operator) simplifies managing VictoriaMetrics components in Kubernetes.
### Replication and data safety
In order to enable application-level replication, `-replicationFactor=N` command-line flag must be passed to `vminsert`.
This guarantees that all the data remains available for querying if up to `N-1` `vmstorage` nodes are unavailable.
For example, when `-replicationFactor=3` is passed to `vminsert`, then it replicates all the ingested data to 3 distinct `vmstorage` nodes.
When the replication is enabled, `-replicationFactor=N` and `-dedup.minScrapeInterval=1ms` command-line flags must be passed to `vmselect` nodes.
The `-replicationFactor=N` improves query performance when a part of vmstorage nodes respond slowly and/or are temporarily unavailable.
The `-dedup.minScrapeInterval=1ms` de-duplicates replicated data during queries. It is OK if `-dedup.minScrapeInterval` exceeds 1ms
when [deduplication](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#deduplication) is used additionally to replication.
Note that [replication doesn't save from disaster](https://medium.com/@valyala/speeding-up-backups-for-big-time-series-databases-533c1a927883),
so it is recommended performing regular backups. See [these docs](#backups) for details.
By default VictoriaMetrics offloads replication to the underlying storage pointed by `-storageDataPath`.
It is recommended storing data on [Google Compute Engine persistent disks](https://cloud.google.com/compute/docs/disks/#pdspecs),
since they are protected from data loss and data corruption. They also provide consistently high performance
and [may be resized](https://cloud.google.com/compute/docs/disks/add-persistent-disk) without downtime.
HDD-based persistent disks should be enough for the majority of use cases.
It is recommended using durable replicated persistent volumes in Kubernetes.
### Backups
It is recommended performing periodical backups from [instant snapshots](https://medium.com/@valyala/how-victoriametrics-makes-instant-snapshots-for-multi-terabyte-time-series-data-e1f3fb0e0282)
for protecting from user errors such as accidental data deletion.
The following steps must be performed for each `vmstorage` node for creating a backup:
1. Create an instant snapshot by navigating to `/snapshot/create` HTTP handler. It will create snapshot and return its name.
2. Archive the created snapshot from `<-storageDataPath>/snapshots/<snapshot_name>` folder using [vmbackup](https://victoriametrics.github.io/vmbackup.html).
The archival process doesn't interfere with `vmstorage` work, so it may be performed at any suitable time.
3. Delete unused snapshots via `/snapshot/delete?snapshot=<snapshot_name>` or `/snapshot/delete_all` in order to free up occupied storage space.
There is no need in synchronizing backups among all the `vmstorage` nodes.
Restoring from backup:
1. Stop `vmstorage` node with `kill -INT`.
2. Restore data from backup using [vmrestore](https://victoriametrics.github.io/vmrestore.html) into `-storageDataPath` directory.
3. Start `vmstorage` node.
## Community and contributions
Feel free to ask any questions regarding VictoriaMetrics [here](https://groups.google.com/forum/#!forum/victorametrics-users).
We are open to third-party pull requests provided they follow [KISS design principle](https://en.wikipedia.org/wiki/KISS_principle):
- Prefer simple code and architecture.
@@ -392,6 +389,16 @@ We are open to third-party pull requests provided they follow [KISS design princ
Adhering to the `KISS` principle simplifies the resulting code and architecture, so it can be reviewed, understood and verified by many people.
Due to `KISS`, the cluster version of VictoriaMetrics has none of the following "features" popular in the distributed computing world:
- Fragile gossip protocols. See [failed attempt in Thanos](https://github.com/improbable-eng/thanos/blob/030bc345c12c446962225221795f4973848caab5/docs/proposals/completed/201809_gossip-removal.md).
- Hard-to-understand-and-implement-properly [Paxos protocols](https://www.quora.com/In-distributed-systems-what-is-a-simple-explanation-of-the-Paxos-algorithm).
- Complex replication schemes, which may go nuts in unforeseen edge cases. See [replication docs](#replication-and-data-safety) for details.
- Automatic data reshuffling between storage nodes, which may hurt cluster performance and availability.
- Automatic cluster resizing, which may cost you a lot of money if improperly configured.
- Automatic discovering and addition of new nodes in the cluster, which may mix data between dev and prod clusters :)
- Automatic leader election, which may result in split brain disaster on network errors.
## Reporting bugs

View File

@@ -1,21 +0,0 @@
# All these commands must run from repository root.
# Each rule below sets APP_NAME=victoria-metrics and delegates to a generic
# target (app-via-docker, package-via-docker, ...) that is not defined in this file.
# Delegates to the app-via-docker target.
victoria-metrics-prod:
APP_NAME=victoria-metrics $(MAKE) app-via-docker
# Delegates to the package-via-docker target.
package-victoria-metrics:
APP_NAME=victoria-metrics \
$(MAKE) package-via-docker
# Delegates to the publish-via-docker target.
publish-victoria-metrics:
APP_NAME=victoria-metrics $(MAKE) publish-via-docker
# Creates the victoria-metrics-data directory, mounts it into the container
# and delegates to run-via-docker with the ARGS listed below.
# NOTE(review): ARGS sets -opentsdbListenAddr=:4242, but DOCKER_OPTS publishes
# only ports 8428 and 2003 (tcp+udp) - confirm whether 4242 should be published too.
run-victoria-metrics:
mkdir -p victoria-metrics-data
DOCKER_OPTS='-v $(shell pwd)/victoria-metrics-data:/victoria-metrics-data -p 8428:8428 -p 2003:2003 -p 2003:2003/udp' \
APP_NAME=victoria-metrics \
ARGS='-graphiteListenAddr=:2003 -opentsdbListenAddr=:4242 -retentionPeriod=12 -search.maxUniqueTimeseries=1000000 -search.maxQueryDuration=10m' \
$(MAKE) run-via-docker
# Builds ./app/victoria-metrics for GOARCH=arm with cgo enabled, using
# arm-linux-gnueabi-gcc as the C compiler and vendored modules; the binary is
# written to bin/victoria-metrics-arm.
victoria-metrics-arm:
CC=arm-linux-gnueabi-gcc CGO_ENABLED=1 GOARCH=arm GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/victoria-metrics-arm ./app/victoria-metrics

View File

@@ -1,5 +0,0 @@
# Image for the victoria-metrics-prod binary, built on the empty `scratch` base.
FROM scratch
# CA certificates are taken from the local/certs:1.0.2 image.
COPY --from=local/certs:1.0.2 /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-certificates.crt
# The binary is copied from bin/victoria-metrics-prod in the build context.
COPY bin/victoria-metrics-prod .
# Port 8428 is declared as exposed.
EXPOSE 8428
ENTRYPOINT ["/victoria-metrics-prod"]

View File

@@ -1,60 +0,0 @@
package main
import (
"flag"
"net/http"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmstorage"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
)
// httpListenAddr holds the value of the -httpListenAddr command-line flag,
// which defaults to ":8428". main passes it to httpserver.Serve and httpserver.Stop.
var httpListenAddr = flag.String("httpListenAddr", ":8428", "TCP address to listen for http connections")
// main is the entry point of the single-binary VictoriaMetrics.
//
// It parses command-line flags, initializes build info and logging, then
// initializes vmstorage, vmselect and vminsert (in that order) and starts
// serving HTTP at -httpListenAddr in a separate goroutine.
//
// It then blocks in procutil.WaitForSigterm until a signal arrives and shuts
// down in this order: the HTTP server, vminsert, vmstorage, vmselect.
// A failure to stop the HTTP server is fatal.
func main() {
	flag.Parse()
	buildinfo.Init()
	logger.Init()

	// The product name in this message was previously misspelled as
	// "VictoraMetrics", which made the startup line inconsistent with
	// the rest of the log output.
	logger.Infof("starting VictoriaMetrics at %q...", *httpListenAddr)
	startTime := time.Now()
	vmstorage.Init()
	vmselect.Init()
	vminsert.Init()

	go httpserver.Serve(*httpListenAddr, requestHandler)
	logger.Infof("started VictoriaMetrics in %s", time.Since(startTime))

	sig := procutil.WaitForSigterm()
	logger.Infof("received signal %s", sig)

	logger.Infof("gracefully shutting down webservice at %q", *httpListenAddr)
	// startTime is reused to measure the shutdown duration from here on.
	startTime = time.Now()
	if err := httpserver.Stop(*httpListenAddr); err != nil {
		logger.Fatalf("cannot stop the webservice: %s", err)
	}
	vminsert.Stop()
	logger.Infof("successfully shut down the webservice in %s", time.Since(startTime))

	vmstorage.Stop()
	vmselect.Stop()

	logger.Infof("the VictoriaMetrics has been stopped in %s", time.Since(startTime))
}
// requestHandler offers the request to the vminsert, vmselect and vmstorage
// request handlers, in that order, and returns true as soon as one of them
// reports that it handled the request. Later handlers are not invoked once an
// earlier one returns true. It returns false when none of them handled it.
func requestHandler(w http.ResponseWriter, r *http.Request) bool {
	return vminsert.RequestHandler(w, r) ||
		vmselect.RequestHandler(w, r) ||
		vmstorage.RequestHandler(w, r)
}

80
app/vmagent/Makefile Normal file
View File

@@ -0,0 +1,80 @@
# All these commands must run from repository root.
# Every rule below sets APP_NAME=vmagent and delegates to a generic target
# (app-local, app-via-docker*, package-via-docker*, ...) that is not defined in this file.
# Delegates to the app-local target.
vmagent:
APP_NAME=vmagent $(MAKE) app-local
# Same as `vmagent`, additionally passing RACE=-race.
vmagent-race:
APP_NAME=vmagent RACE=-race $(MAKE) app-local
# The *-prod rules delegate to the app-via-docker target or to its
# per-variant counterpart (pure, amd64, arm, arm64, ppc64le, 386).
vmagent-prod:
APP_NAME=vmagent $(MAKE) app-via-docker
vmagent-pure-prod:
APP_NAME=vmagent $(MAKE) app-via-docker-pure
vmagent-amd64-prod:
APP_NAME=vmagent $(MAKE) app-via-docker-amd64
vmagent-arm-prod:
APP_NAME=vmagent $(MAKE) app-via-docker-arm
vmagent-arm64-prod:
APP_NAME=vmagent $(MAKE) app-via-docker-arm64
vmagent-ppc64le-prod:
APP_NAME=vmagent $(MAKE) app-via-docker-ppc64le
vmagent-386-prod:
APP_NAME=vmagent $(MAKE) app-via-docker-386
# The package-* rules delegate to the package-via-docker target or to its
# per-variant counterpart.
package-vmagent:
APP_NAME=vmagent $(MAKE) package-via-docker
package-vmagent-pure:
APP_NAME=vmagent $(MAKE) package-via-docker-pure
package-vmagent-amd64:
APP_NAME=vmagent $(MAKE) package-via-docker-amd64
package-vmagent-arm:
APP_NAME=vmagent $(MAKE) package-via-docker-arm
package-vmagent-arm64:
APP_NAME=vmagent $(MAKE) package-via-docker-arm64
package-vmagent-ppc64le:
APP_NAME=vmagent $(MAKE) package-via-docker-ppc64le
package-vmagent-386:
APP_NAME=vmagent $(MAKE) package-via-docker-386
# Delegates to the publish-via-docker target.
publish-vmagent:
APP_NAME=vmagent $(MAKE) publish-via-docker
# Creates the vmagent-remotewrite-data directory, mounts it into the container
# and delegates to run-via-docker with
# -remoteWrite.url=http://localhost:8428/api/v1/write.
run-vmagent:
mkdir -p vmagent-remotewrite-data
DOCKER_OPTS='-v $(shell pwd)/vmagent-remotewrite-data:/vmagent-remotewrite-data' \
ARGS='-remoteWrite.url=http://localhost:8428/api/v1/write' \
APP_NAME=vmagent \
$(MAKE) run-via-docker
# Per-architecture rules: each sets CGO_ENABLED and GOARCH and delegates to
# vmagent-local-with-goarch. Only the amd64 rule sets CGO_ENABLED=1.
vmagent-amd64:
CGO_ENABLED=1 GOARCH=amd64 $(MAKE) vmagent-local-with-goarch
vmagent-arm:
CGO_ENABLED=0 GOARCH=arm $(MAKE) vmagent-local-with-goarch
vmagent-arm64:
CGO_ENABLED=0 GOARCH=arm64 $(MAKE) vmagent-local-with-goarch
vmagent-ppc64le:
CGO_ENABLED=0 GOARCH=ppc64le $(MAKE) vmagent-local-with-goarch
vmagent-386:
CGO_ENABLED=0 GOARCH=386 $(MAKE) vmagent-local-with-goarch
# Shared step for the per-architecture rules above; delegates to app-local-with-goarch.
vmagent-local-with-goarch:
APP_NAME=vmagent $(MAKE) app-local-with-goarch
# Delegates to the app-local-pure target.
vmagent-pure:
APP_NAME=vmagent $(MAKE) app-local-pure

383
app/vmagent/README.md Normal file
View File

@@ -0,0 +1,383 @@
## vmagent
`vmagent` is a tiny but brave agent, which helps you collect metrics from various sources
and stores them in [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics)
or any other Prometheus-compatible storage system that supports the `remote_write` protocol.
<img alt="vmagent" src="vmagent.png">
### Motivation
While VictoriaMetrics provides an efficient solution to store and observe metrics, our users needed something fast
and RAM friendly to scrape metrics from Prometheus-compatible exporters to VictoriaMetrics.
Also, we found that users' infrastructures are snowflakes - no two are alike, and we decided to add more flexibility
to `vmagent` (like the ability to push metrics instead of pulling them). We did our best and plan to do even more.
### Features
* Can be used as drop-in replacement for Prometheus for scraping targets such as [node_exporter](https://github.com/prometheus/node_exporter).
See [Quick Start](#quick-start) for details.
* Can add, remove and modify labels (aka tags) via Prometheus relabeling. Can filter data before sending it to remote storage. See [these docs](#relabeling) for details.
* Accepts data via all the ingestion protocols supported by VictoriaMetrics:
* Influx line protocol via `http://<vmagent>:8429/write`. See [these docs](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-send-data-from-influxdb-compatible-agents-such-as-telegraf).
* Graphite plaintext protocol if `-graphiteListenAddr` command-line flag is set. See [these docs](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-send-data-from-graphite-compatible-agents-such-as-statsd).
* OpenTSDB telnet and http protocols if `-opentsdbListenAddr` command-line flag is set. See [these docs](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-send-data-from-opentsdb-compatible-agents).
* Prometheus remote write protocol via `http://<vmagent>:8429/api/v1/write`.
* JSON lines import protocol via `http://<vmagent>:8429/api/v1/import`. See [these docs](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-import-data-in-json-line-format).
* Native data import protocol via `http://<vmagent>:8429/api/v1/import/native`. See [these docs](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-import-data-in-native-format).
* Data in Prometheus exposition format. See [these docs](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-import-data-in-prometheus-exposition-format) for details.
* Arbitrary CSV data via `http://<vmagent>:8429/api/v1/import/csv`. See [these docs](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-import-csv-data).
* Can replicate collected metrics simultaneously to multiple remote storage systems.
* Works in environments with unstable connections to remote storage. If the remote storage is unavailable, the collected metrics
are buffered at `-remoteWrite.tmpDataPath`. The buffered metrics are sent to remote storage as soon as connection
to remote storage is recovered. The maximum disk usage for the buffer can be limited with `-remoteWrite.maxDiskUsagePerURL`.
* Uses lower amounts of RAM, CPU, disk IO and network bandwidth compared to Prometheus.
### Quick Start
Just download `vmutils-*` archive from [releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases), unpack it
and pass the following flags to `vmagent` binary in order to start scraping Prometheus targets:
* `-promscrape.config` with the path to Prometheus config file (it is usually located at `/etc/prometheus/prometheus.yml`)
* `-remoteWrite.url` with the remote storage endpoint such as VictoriaMetrics. The `-remoteWrite.url` argument can be specified multiple times in order to replicate data concurrently to an arbitrary number of remote storage systems.
Example command line:
```
/path/to/vmagent -promscrape.config=/path/to/prometheus.yml -remoteWrite.url=https://victoria-metrics-host:8428/api/v1/write
```
If you only need to collect Influx data, then the following is sufficient:
```
/path/to/vmagent -remoteWrite.url=https://victoria-metrics-host:8428/api/v1/write
```
Then send Influx data to `http://vmagent-host:8429`. See [these docs](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-send-data-from-influxdb-compatible-agents-such-as-telegraf) for more details.
`vmagent` is also available in [docker images](https://hub.docker.com/r/victoriametrics/vmagent/tags).
Pass `-help` to `vmagent` in order to see the full list of supported command-line flags with their descriptions.
### Configuration update
`vmagent` should be restarted in order to update config options set via command-line args.
`vmagent` supports multiple approaches for reloading configs from updated config files such as `-promscrape.config`, `-remoteWrite.relabelConfig` and `-remoteWrite.urlRelabelConfig`:
* Sending `SIGHUP` signal to `vmagent` process:
```bash
kill -SIGHUP `pidof vmagent`
```
* Sending HTTP request to `http://vmagent:8429/-/reload` endpoint.
There is also `-promscrape.configCheckInterval` command-line option, which can be used for automatic reloading configs from updated `-promscrape.config` file.
### Use cases
#### IoT and Edge monitoring
`vmagent` can run and collect metrics in IoT and industrial networks with unreliable or scheduled connections to the remote storage.
It buffers the collected data in local files until the connection to remote storage becomes available and then sends the buffered
data to the remote storage. It re-tries sending the data to remote storage on any errors.
The maximum buffer size can be limited with `-remoteWrite.maxDiskUsagePerURL`.
`vmagent` works on various architectures from IoT world - 32-bit arm, 64-bit arm, ppc64, 386, amd64.
See [the corresponding Makefile rules](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/Makefile) for details.
#### Drop-in replacement for Prometheus
If you use Prometheus only for scraping metrics from various targets and forwarding these metrics to remote storage,
then `vmagent` can replace such Prometheus setup. Usually `vmagent` requires lower amounts of RAM, CPU and network bandwidth compared to Prometheus for such a setup.
See [these docs](#how-to-collect-metrics-in-prometheus-format) for details.
#### Replication and high availability
`vmagent` replicates the collected metrics among multiple remote storage instances configured via `-remoteWrite.url` args.
If a single remote storage instance is temporarily out of service, then the collected data remains available in the other remote storage instances.
`vmagent` buffers the collected data in files at `-remoteWrite.tmpDataPath` until the remote storage becomes available again.
Then it sends the buffered data to the remote storage in order to prevent data gaps in the remote storage.
#### Relabeling and filtering
`vmagent` can add, remove or update labels on the collected data before sending it to remote storage. Additionally,
it can remove unwanted samples via Prometheus-like relabeling before sending the collected data to remote storage.
See [these docs](#relabeling) for details.
#### Splitting data streams among multiple systems
`vmagent` supports splitting the collected data between multiple destinations with the help of `-remoteWrite.urlRelabelConfig`,
which is applied independently for each configured `-remoteWrite.url` destination. For instance, it is possible to replicate or split
data among long-term remote storage, short-term remote storage and real-time analytical system [built on top of Kafka](https://github.com/Telefonica/prometheus-kafka-adapter).
Note that each destination can receive its own subset of the collected data thanks to per-destination relabeling via `-remoteWrite.urlRelabelConfig`.
#### Prometheus remote_write proxy
`vmagent` may be used as a proxy for Prometheus data sent via Prometheus `remote_write` protocol. It can accept data via `remote_write` API
at `/api/v1/write` endpoint, apply relabeling and filtering and then proxy it to other `remote_write` systems.
The `vmagent` can be configured to encrypt the incoming `remote_write` requests with `-tls*` command-line flags.
Additionally, Basic Auth can be enabled for the incoming `remote_write` requests with `-httpAuth.*` command-line flags.
### How to collect metrics in Prometheus format
Pass the path to `prometheus.yml` to `-promscrape.config` command-line flag. `vmagent` takes into account the following
sections from [Prometheus config file](https://prometheus.io/docs/prometheus/latest/configuration/configuration/):
* `global`
* `scrape_configs`
All the other sections are ignored, including [remote_write](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write) section.
Use `-remoteWrite.*` command-line flags instead for configuring remote write settings.
The following scrape types in [scrape_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config) section are supported:
* `static_configs` - for scraping statically defined targets. See [these docs](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#static_config) for details.
* `file_sd_configs` - for scraping targets defined in external files aka file-based service discovery.
See [these docs](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#file_sd_config) for details.
* `kubernetes_sd_configs` - for scraping targets in Kubernetes (k8s).
See [kubernetes_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#kubernetes_sd_config) for details.
* `ec2_sd_configs` - for scraping targets in Amazon EC2.
See [ec2_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#ec2_sd_config) for details.
`vmagent` doesn't support `profile` config param and aws credentials file yet.
* `gce_sd_configs` - for scraping targets in Google Compute Engine (GCE).
See [gce_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#gce_sd_config) for details.
`vmagent` provides the following additional functionality for `gce_sd_config`:
* if `project` arg is missing, then `vmagent` uses the project for the instance where it runs;
* if `zone` arg is missing, then `vmagent` uses the zone for the instance where it runs;
* if `zone` arg equals to `"*"`, then `vmagent` discovers all the zones for the given project;
* `zone` may contain arbitrary number of zones, i.e. `zone: [us-east1-a, us-east1-b]`.
* `consul_sd_configs` - for scraping targets registered in Consul.
See [consul_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#consul_sd_config) for details.
* `dns_sd_configs` - for scraping targets discovered from DNS records (SRV, A and AAAA).
See [dns_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#dns_sd_config) for details.
* `openstack_sd_configs` - for scraping OpenStack targets.
See [openstack_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#openstack_sd_config) for details.
Only [OpenStack identity API v3](https://docs.openstack.org/api-ref/identity/v3/) is supported.
* `dockerswarm_sd_configs` - for scraping Docker Swarm targets.
See [dockerswarm_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#dockerswarm_sd_config) for details.
* `eureka_sd_configs` - for scraping targets registered in [Netflix Eureka](https://github.com/Netflix/eureka).
See [eureka_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#eureka_sd_config) for details.
File feature requests at [our issue tracker](https://github.com/VictoriaMetrics/VictoriaMetrics/issues) if you need other service discovery mechanisms to be supported by `vmagent`.
`vmagent` also supports the following additional options in `scrape_config` section:
* `disable_compression: true` - for disabling response compression on a per-job basis. By default `vmagent` requests compressed responses from scrape targets
in order to save network bandwidth.
* `disable_keepalive: true` - for disabling [HTTP keep-alive connections](https://en.wikipedia.org/wiki/HTTP_persistent_connection) on a per-job basis.
By default `vmagent` uses keep-alive connections to scrape targets in order to reduce overhead on connection re-establishing.
Note that `vmagent` doesn't support `refresh_interval` option in these scrape configs. Use the corresponding `-promscrape.*CheckInterval`
command-line flag instead. For example, `-promscrape.consulSDCheckInterval=60s` sets `refresh_interval` for all the `consul_sd_configs`
entries to 60s. Run `vmagent -help` in order to see default values for `-promscrape.*CheckInterval` flags.
The file pointed by `-promscrape.config` may contain `%{ENV_VAR}` placeholders, which are substituted by the corresponding `ENV_VAR` environment variable values.
### Adding labels to metrics
Labels can be added to metrics via the following mechanisms:
* Via `global -> external_labels` section in `-promscrape.config` file. These labels are added only to metrics scraped from targets configured in `-promscrape.config` file.
* Via `-remoteWrite.label` command-line flag. These labels are added to all the collected metrics before sending them to `-remoteWrite.url`.
### Relabeling
`vmagent` supports [Prometheus relabeling](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config).
Additionally it provides the following extra actions:
* `replace_all`: replaces all the occurrences of `regex` in the values of `source_labels` with the `replacement` and stores the result in the `target_label`.
* `labelmap_all`: replaces all the occurrences of `regex` in all the label names with the `replacement`.
* `keep_if_equal`: keeps the entry if all label values from `source_labels` are equal.
* `drop_if_equal`: drops the entry if all the label values from `source_labels` are equal.
The relabeling can be defined in the following places:
* At `scrape_config -> relabel_configs` section in `-promscrape.config` file. This relabeling is applied to target labels.
* At `scrape_config -> metric_relabel_configs` section in `-promscrape.config` file. This relabeling is applied to all the scraped metrics in the given `scrape_config`.
* At `-remoteWrite.relabelConfig` file. This relabeling is applied to all the collected metrics before sending them to remote storage.
* At `-remoteWrite.urlRelabelConfig` files. This relabeling is applied to metrics before sending them to the corresponding `-remoteWrite.url`.
Read more about relabeling in the following articles:
* [How to use Relabeling in Prometheus and VictoriaMetrics](https://valyala.medium.com/how-to-use-relabeling-in-prometheus-and-victoriametrics-8b90fc22c4b2)
* [Life of a label](https://www.robustperception.io/life-of-a-label)
* [Discarding targets and timeseries with relabeling](https://www.robustperception.io/relabelling-can-discard-targets-timeseries-and-alerts)
* [Dropping labels at scrape time](https://www.robustperception.io/dropping-metrics-at-scrape-time-with-prometheus)
* [Extracting labels from legacy metric names](https://www.robustperception.io/extracting-labels-from-legacy-metric-names)
* [relabel_configs vs metric_relabel_configs](https://www.robustperception.io/relabel_configs-vs-metric_relabel_configs)
### Monitoring
`vmagent` exports various metrics in Prometheus exposition format at `http://vmagent-host:8429/metrics` page. It is recommended setting up regular scraping of this page
either via `vmagent` itself or via Prometheus, so the exported metrics could be analyzed later.
Use official [Grafana dashboard](https://grafana.com/grafana/dashboards/12683) for `vmagent` state overview.
If you have suggestions, improvements or found a bug - feel free to open an issue on github or add review to the dashboard.
`vmagent` also exports target statuses at the following handlers:
* `http://vmagent-host:8429/targets`. This handler returns human-readable plaintext status for every active target.
This page is convenient to query from command line with `wget`, `curl` or similar tools.
It accepts optional `show_original_labels=1` query arg, which shows the original labels per each target before applying relabeling.
This information may be useful for debugging target relabeling.
* `http://vmagent-host:8429/api/v1/targets`. This handler returns data compatible with [the corresponding page from Prometheus API](https://prometheus.io/docs/prometheus/latest/querying/api/#targets).
* `http://vmagent-host:8429/ready`. This handler returns http 200 status code when `vmagent` finishes initialization for all service_discovery configs.
It may be useful for performing `vmagent` rolling update without scrape loss.
### Troubleshooting
* It is recommended [setting up the official Grafana dashboard](#monitoring) in order to monitor `vmagent` state.
* It is recommended increasing the maximum number of open files in the system (`ulimit -n`) when scraping big number of targets,
since `vmagent` establishes at least a single TCP connection per each target.
* When `vmagent` scrapes many unreliable targets, it can flood error log with scrape errors. These errors can be suppressed
by passing `-promscrape.suppressScrapeErrors` command-line flag to `vmagent`. The most recent scrape error per each target can be observed at `http://vmagent-host:8429/targets`
and `http://vmagent-host:8429/api/v1/targets`.
* The `/api/v1/targets` page could be useful for debugging relabeling process for scrape targets.
This page contains original labels for targets dropped during relabeling (see "droppedTargets" section in the page output). By default up to `-promscrape.maxDroppedTargets` targets are shown here. If your setup drops more targets during relabeling, then increase `-promscrape.maxDroppedTargets` command-line flag value in order to see all the dropped targets. Note that tracking each dropped target requires up to 10Kb of RAM, so big values for `-promscrape.maxDroppedTargets` may result in increased memory usage if big number of scrape targets are dropped during relabeling.
* If `vmagent` scrapes big number of targets, then `-promscrape.dropOriginalLabels` command-line option may be passed to `vmagent` in order to reduce memory usage.
This option drops `"discoveredLabels"` and `"droppedTargets"` lists at `/api/v1/targets` page, which may result in reduced debuggability for improperly configured per-target relabeling.
* If `vmagent` scrapes targets with millions of metrics per each target (for instance, when scraping [federation endpoints](https://prometheus.io/docs/prometheus/latest/federation/)),
then it is recommended enabling `stream parsing mode` in order to reduce memory usage during scraping. This mode may be enabled either globally for all the scrape targets
by passing `-promscrape.streamParse` command-line flag or on a per-scrape target basis with `stream_parse: true` option. For example:
```yml
scrape_configs:
- job_name: 'big-federate'
stream_parse: true
static_configs:
- targets:
- big-prometeus1
- big-prometeus2
honor_labels: true
metrics_path: /federate
params:
'match[]': ['{__name__!=""}']
```
* It is recommended to increase `-remoteWrite.queues` if `vmagent_remotewrite_pending_data_bytes` metric exported at `http://vmagent-host:8429/metrics` page constantly grows.
* If you see gaps on the data pushed by `vmagent` to remote storage when `-remoteWrite.maxDiskUsagePerURL` is set, then try increasing `-remoteWrite.queues`.
Such gaps may appear because `vmagent` cannot keep up with sending the collected data to remote storage, so it starts dropping the buffered data
if the on-disk buffer size exceeds `-remoteWrite.maxDiskUsagePerURL`.
* `vmagent` buffers scraped data at `-remoteWrite.tmpDataPath` directory until it is sent to `-remoteWrite.url`.
The directory can grow large when remote storage is unavailable for extended periods of time and if `-remoteWrite.maxDiskUsagePerURL` isn't set.
If you don't want to send all the data from the directory to remote storage, simply stop `vmagent` and delete the directory.
* By default `vmagent` masks `-remoteWrite.url` with `secret-url` values in logs and at `/metrics` page because
the url may contain sensitive information such as auth tokens or passwords.
Pass `-remoteWrite.showURL` command-line flag when starting `vmagent` in order to see all the valid urls.
* If you see `skipping duplicate scrape target with identical labels` errors when scraping Kubernetes pods, then it is likely these pods listen on multiple ports
or they use an init container. These errors can be either fixed or suppressed with `-promscrape.suppressDuplicateScrapeTargetErrors` command-line flag.
See available options below if you prefer fixing the root cause of the error:
The following `relabel_configs` section may help determining `__meta_*` labels resulting in duplicate targets:
```yml
- action: labelmap
regex: __meta_(.*)
```
The following relabeling rule may be added to `relabel_configs` section in order to filter out pods with unneeded ports:
```yml
- action: keep_if_equal
source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_container_port_number]
```
The following relabeling rule may be added to `relabel_configs` section in order to filter out init container pods:
```yml
- action: drop
source_labels: [__meta_kubernetes_pod_container_init]
regex: true
```
### How to build from sources
It is recommended using [binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) - `vmagent` is located in `vmutils-*` archives there.
#### Development build
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.13.
2. Run `make vmagent` from the root folder of the repository.
It builds `vmagent` binary and puts it into the `bin` folder.
#### Production build
1. [Install docker](https://docs.docker.com/install/).
2. Run `make vmagent-prod` from the root folder of the repository.
It builds `vmagent-prod` binary and puts it into the `bin` folder.
#### Building docker images
Run `make package-vmagent`. It builds `victoriametrics/vmagent:<PKG_TAG>` docker image locally.
`<PKG_TAG>` is auto-generated image tag, which depends on source code in the repository.
The `<PKG_TAG>` may be manually set via `PKG_TAG=foobar make package-vmagent`.
The base docker image is [alpine](https://hub.docker.com/_/alpine) but it is possible to use any other base image
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of [scratch](https://hub.docker.com/_/scratch) image:
```bash
ROOT_IMAGE=scratch make package-vmagent
```
#### ARM build
ARM build may run on Raspberry Pi or on [energy-efficient ARM servers](https://blog.cloudflare.com/arm-takes-wing/).
#### Development ARM build
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.13.
2. Run `make vmagent-arm` or `make vmagent-arm64` from the root folder of the repository.
It builds `vmagent-arm` or `vmagent-arm64` binary respectively and puts it into the `bin` folder.
#### Production ARM build
1. [Install docker](https://docs.docker.com/install/).
2. Run `make vmagent-arm-prod` or `make vmagent-arm64-prod` from the root folder of the repository.
It builds `vmagent-arm-prod` or `vmagent-arm64-prod` binary respectively and puts it into the `bin` folder.
### Profiling
`vmagent` provides handlers for collecting the following [Go profiles](https://blog.golang.org/profiling-go-programs):
* Memory profile. It can be collected with the following command:
```bash
curl -s http://<vmagent-host>:8429/debug/pprof/heap > mem.pprof
```
* CPU profile. It can be collected with the following command:
```bash
curl -s http://<vmagent-host>:8429/debug/pprof/profile > cpu.pprof
```
The command for collecting CPU profile waits for 30 seconds before returning.
The collected profiles may be analyzed with [go tool pprof](https://github.com/google/pprof).

View File

@@ -0,0 +1,66 @@
package common
import (
"sync"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/cgroup"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promrelabel"
)
// PushCtx is a context used for populating WriteRequest.
//
// Obtain it with GetPushCtx and hand it back with PutPushCtx so the
// underlying slices can be reused.
type PushCtx struct {
// WriteRequest is the request being assembled. Its Timeseries entries are
// expected to reference sub-slices of Labels and Samples below.
WriteRequest prompbmarshal.WriteRequest
// Labels contains flat list of all the labels used in WriteRequest.
Labels []prompbmarshal.Label
// Samples contains flat list of all the samples used in WriteRequest.
Samples []prompbmarshal.Sample
}
// Reset prepares ctx for reuse.
//
// Every time series drops its references to label and sample slices, and all
// the buffers are truncated to zero length while keeping their capacity.
func (ctx *PushCtx) Reset() {
	wr := &ctx.WriteRequest
	for i := range wr.Timeseries {
		// Drop references so the sub-slices do not keep stale data reachable.
		wr.Timeseries[i].Labels = nil
		wr.Timeseries[i].Samples = nil
	}
	wr.Timeseries = wr.Timeseries[:0]

	// NOTE(review): CleanLabels presumably clears the strings held by the labels - confirm in promrelabel.
	promrelabel.CleanLabels(ctx.Labels)
	ctx.Labels = ctx.Labels[:0]

	ctx.Samples = ctx.Samples[:0]
}
// GetPushCtx returns a PushCtx ready for use.
//
// The buffered channel of recently returned contexts is tried first, then
// the sync.Pool; a fresh PushCtx is allocated only if both are empty.
//
// Call PutPushCtx when the ctx is no longer needed.
func GetPushCtx() *PushCtx {
	select {
	case ctx := <-pushCtxPoolCh:
		return ctx
	default:
		// The channel is empty; fall through to the sync.Pool.
	}

	v := pushCtxPool.Get()
	if v == nil {
		return &PushCtx{}
	}
	return v.(*PushCtx)
}
// PutPushCtx resets ctx and hands it back for reuse.
//
// The ctx goes into the buffered channel when it has room and into the
// sync.Pool otherwise.
//
// ctx mustn't be used after returning to the pool.
func PutPushCtx(ctx *PushCtx) {
	ctx.Reset()

	select {
	case pushCtxPoolCh <- ctx:
		return
	default:
		// The channel is full; fall back to the sync.Pool.
	}
	pushCtxPool.Put(ctx)
}
// pushCtxPool is the fallback storage for PushCtx objects: it is used by
// GetPushCtx when pushCtxPoolCh is empty and by PutPushCtx when it is full.
var pushCtxPool sync.Pool
// pushCtxPoolCh holds up to cgroup.AvailableCPUs() PushCtx objects and is
// consulted before pushCtxPool by both GetPushCtx and PutPushCtx.
var pushCtxPoolCh = make(chan *PushCtx, cgroup.AvailableCPUs())

View File

@@ -0,0 +1,71 @@
package csvimport
import (
"net/http"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/common"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
parserCommon "github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/common"
parser "github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/csvimport"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/writeconcurrencylimiter"
"github.com/VictoriaMetrics/metrics"
)
// Metrics updated by insertRows for data ingested via the csvimport handler.
var (
// rowsInserted is increased by the number of rows in every insertRows call.
rowsInserted = metrics.NewCounter(`vmagent_rows_inserted_total{type="csvimport"}`)
// rowsPerInsert records the number of rows seen by every insertRows call.
rowsPerInsert = metrics.NewHistogram(`vmagent_rows_per_insert{type="csvimport"}`)
)
// InsertHandler parses CSV data from req and pushes it to remote storage.
//
// Extra labels extracted from req are attached to every ingested row.
// Parsing runs under writeconcurrencylimiter. The returned error comes either
// from extra label extraction or from the limiter/parser.
func InsertHandler(req *http.Request) error {
	extraLabels, err := parserCommon.GetExtraLabels(req)
	if err != nil {
		return err
	}

	// pushRows is invoked by the parser for every parsed batch of rows.
	pushRows := func(rows []parser.Row) error {
		return insertRows(rows, extraLabels)
	}
	return writeconcurrencylimiter.Do(func() error {
		return parser.ParseStream(req, pushRows)
	})
}
// insertRows converts rows into time series with a single sample each and
// pushes them via remotewrite.Push.
//
// Every series gets the `__name__` label set to the row metric, followed by
// the row tags and then extraLabels. It always returns nil.
func insertRows(rows []parser.Row, extraLabels []prompbmarshal.Label) error {
	ctx := common.GetPushCtx()
	defer common.PutPushCtx(ctx)

	series := ctx.WriteRequest.Timeseries[:0]
	labelBuf := ctx.Labels[:0]
	sampleBuf := ctx.Samples[:0]
	for i := range rows {
		row := &rows[i]

		// Labels of the current series occupy labelBuf[labelsStart:].
		labelsStart := len(labelBuf)
		labelBuf = append(labelBuf, prompbmarshal.Label{Name: "__name__", Value: row.Metric})
		for j := range row.Tags {
			labelBuf = append(labelBuf, prompbmarshal.Label{
				Name:  row.Tags[j].Key,
				Value: row.Tags[j].Value,
			})
		}
		labelBuf = append(labelBuf, extraLabels...)

		// The single sample of the current series occupies sampleBuf[sampleStart:].
		sampleStart := len(sampleBuf)
		sampleBuf = append(sampleBuf, prompbmarshal.Sample{
			Value:     row.Value,
			Timestamp: row.Timestamp,
		})

		series = append(series, prompbmarshal.TimeSeries{
			Labels:  labelBuf[labelsStart:],
			Samples: sampleBuf[sampleStart:],
		})
	}

	// Store the (possibly reallocated) buffers back, so they are reused later.
	ctx.WriteRequest.Timeseries = series
	ctx.Labels = labelBuf
	ctx.Samples = sampleBuf
	remotewrite.Push(&ctx.WriteRequest)

	rowsCount := len(rows)
	rowsInserted.Add(rowsCount)
	rowsPerInsert.Update(float64(rowsCount))
	return nil
}

View File

@@ -0,0 +1,8 @@
# base_image is the image to build on top of. No default is declared here,
# so it is expected to be passed as a build argument.
ARG base_image
FROM $base_image
# 8429 matches the default value of the vmagent -httpListenAddr flag.
EXPOSE 8429
ENTRYPOINT ["/vmagent-prod"]
# src_binary is the path of the pre-built vmagent binary to copy into the image.
ARG src_binary
# NOTE(review): the relative destination matches the ENTRYPOINT path only
# if the working directory of the base image is `/` - confirm.
COPY $src_binary ./vmagent-prod

View File

@@ -0,0 +1,65 @@
package graphite
import (
"io"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/common"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
parser "github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/graphite"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/writeconcurrencylimiter"
"github.com/VictoriaMetrics/metrics"
)
// Metrics updated by insertRows for data ingested via Graphite plaintext protocol.
var (
// rowsInserted is increased by the number of rows in every insertRows call.
rowsInserted = metrics.NewCounter(`vmagent_rows_inserted_total{type="graphite"}`)
// rowsPerInsert records the number of rows seen by every insertRows call.
rowsPerInsert = metrics.NewHistogram(`vmagent_rows_per_insert{type="graphite"}`)
)
// InsertHandler processes remote write for graphite plaintext protocol.
//
// See https://graphite.readthedocs.io/en/latest/feeding-carbon.html#the-plaintext-protocol
func InsertHandler(r io.Reader) error {
return writeconcurrencylimiter.Do(func() error {
return parser.ParseStream(r, insertRows)
})
}
// insertRows converts rows into time series with a single sample each and
// pushes them via remotewrite.Push.
//
// Every series gets the `__name__` label set to the row metric, followed by
// the row tags. It always returns nil.
func insertRows(rows []parser.Row) error {
	ctx := common.GetPushCtx()
	defer common.PutPushCtx(ctx)

	series := ctx.WriteRequest.Timeseries[:0]
	labelBuf := ctx.Labels[:0]
	sampleBuf := ctx.Samples[:0]
	for i := range rows {
		row := &rows[i]

		// Labels of the current series occupy labelBuf[labelsStart:].
		labelsStart := len(labelBuf)
		labelBuf = append(labelBuf, prompbmarshal.Label{Name: "__name__", Value: row.Metric})
		for j := range row.Tags {
			labelBuf = append(labelBuf, prompbmarshal.Label{
				Name:  row.Tags[j].Key,
				Value: row.Tags[j].Value,
			})
		}

		// The single sample of the current series occupies sampleBuf[sampleStart:].
		sampleStart := len(sampleBuf)
		sampleBuf = append(sampleBuf, prompbmarshal.Sample{
			Value:     row.Value,
			Timestamp: row.Timestamp,
		})

		series = append(series, prompbmarshal.TimeSeries{
			Labels:  labelBuf[labelsStart:],
			Samples: sampleBuf[sampleStart:],
		})
	}

	// Store the (possibly reallocated) buffers back, so they are reused later.
	ctx.WriteRequest.Timeseries = series
	ctx.Labels = labelBuf
	ctx.Samples = sampleBuf
	remotewrite.Push(&ctx.WriteRequest)

	rowsCount := len(rows)
	rowsInserted.Add(rowsCount)
	rowsPerInsert.Update(float64(rowsCount))
	return nil
}

View File

@@ -0,0 +1,168 @@
package influx
import (
"flag"
"io"
"net/http"
"sync"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/common"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/cgroup"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promrelabel"
parser "github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/influx"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/writeconcurrencylimiter"
"github.com/VictoriaMetrics/metrics"
)
// Command-line flags controlling how metric names are built from Influx lines.
var (
	// measurementFieldSeparator is put between the measurement and the field name.
	measurementFieldSeparator = flag.String("influxMeasurementFieldSeparator", "_", "Separator for '{measurement}{separator}{field_name}' metric name when inserted via Influx line protocol")
	// skipSingleField makes the measurement alone the metric name for lines with a single field.
	skipSingleField = flag.Bool("influxSkipSingleField", false, "Uses '{measurement}' instead of '{measurement}{separator}{field_name}' for metric name if Influx line contains only a single field")
	// skipMeasurement makes the field name alone the metric name.
	skipMeasurement = flag.Bool("influxSkipMeasurement", false, "Uses '{field_name}' as a metric name while ignoring '{measurement}' and '-influxMeasurementFieldSeparator'")
)
// Metrics updated by insertRows for data ingested via Influx line protocol.
var (
// rowsInserted is increased by the total number of fields across the rows in every insertRows call.
rowsInserted = metrics.NewCounter(`vmagent_rows_inserted_total{type="influx"}`)
// rowsPerInsert records the total number of fields seen by every insertRows call.
rowsPerInsert = metrics.NewHistogram(`vmagent_rows_per_insert{type="influx"}`)
)
// InsertHandlerForReader processes remote write for influx line protocol.
//
// See https://github.com/influxdata/telegraf/tree/master/plugins/inputs/socket_listener/
func InsertHandlerForReader(r io.Reader) error {
return writeconcurrencylimiter.Do(func() error {
return parser.ParseStream(r, false, "", "", insertRows)
})
}
// InsertHandlerForHTTP reads Influx line protocol data from the body of req
// and pushes the parsed rows to remote storage.
//
// The body is treated as gzipped if the `Content-Encoding` request header
// equals `gzip`. The `precision` and `db` query args are passed to the parser.
// Parsing runs under writeconcurrencylimiter.
//
// See https://github.com/influxdata/influxdb/blob/4cbdc197b8117fee648d62e2e5be75c6575352f0/tsdb/README.md
func InsertHandlerForHTTP(req *http.Request) error {
	return writeconcurrencylimiter.Do(func() error {
		gzipped := req.Header.Get("Content-Encoding") == "gzip"
		args := req.URL.Query()
		// Read db tag from https://docs.influxdata.com/influxdb/v1.7/tools/api/#write-http-endpoint
		return parser.ParseStream(req.Body, gzipped, args.Get("precision"), args.Get("db"), insertRows)
	})
}
// insertRows converts parsed Influx rows into time series and pushes them
// via remotewrite.Push.
//
// A separate time series with a single sample is created per each field of
// each row. The `db` label with the given db value is added to rows, which
// have no `db` tag, if db is non-empty.
//
// It always returns nil.
func insertRows(db string, rows []parser.Row) error {
ctx := getPushCtx()
defer putPushCtx(ctx)
// rowsTotal counts fields (i.e. generated time series), not Influx lines.
rowsTotal := 0
tssDst := ctx.ctx.WriteRequest.Timeseries[:0]
labels := ctx.ctx.Labels[:0]
samples := ctx.ctx.Samples[:0]
commonLabels := ctx.commonLabels[:0]
buf := ctx.buf[:0]
for i := range rows {
r := &rows[i]
rowsTotal += len(r.Fields)
// commonLabels holds labels shared by all the fields of the current row.
commonLabels = commonLabels[:0]
hasDBKey := false
for j := range r.Tags {
tag := &r.Tags[j]
if tag.Key == "db" {
hasDBKey = true
}
commonLabels = append(commonLabels, prompbmarshal.Label{
Name: tag.Key,
Value: tag.Value,
})
}
// The explicit `db` tag of the row takes precedence over the db arg.
if len(db) > 0 && !hasDBKey {
commonLabels = append(commonLabels, prompbmarshal.Label{
Name: "db",
Value: db,
})
}
// metricGroupBuf holds the metric name prefix shared by all the fields of the row:
// the measurement (unless -influxSkipMeasurement is set) optionally followed by the separator.
ctx.metricGroupBuf = ctx.metricGroupBuf[:0]
if !*skipMeasurement {
ctx.metricGroupBuf = append(ctx.metricGroupBuf, r.Measurement...)
}
skipFieldKey := len(r.Fields) == 1 && *skipSingleField
if len(ctx.metricGroupBuf) > 0 && !skipFieldKey {
ctx.metricGroupBuf = append(ctx.metricGroupBuf, *measurementFieldSeparator...)
}
for j := range r.Fields {
f := &r.Fields[j]
// Metric names for all the series are accumulated in buf;
// the name of the current series occupies buf[bufLen:].
bufLen := len(buf)
buf = append(buf, ctx.metricGroupBuf...)
if !skipFieldKey {
buf = append(buf, f.Key...)
}
// The string below is created via bytesutil.ToUnsafeString from a region of buf,
// so buf contents mustn't be modified until the push below completes.
// NOTE(review): presumably ToUnsafeString aliases the bytes without copying - confirm in bytesutil.
metricGroup := bytesutil.ToUnsafeString(buf[bufLen:])
labelsLen := len(labels)
labels = append(labels, prompbmarshal.Label{
Name: "__name__",
Value: metricGroup,
})
labels = append(labels, commonLabels...)
samples = append(samples, prompbmarshal.Sample{
Timestamp: r.Timestamp,
Value: f.Value,
})
tssDst = append(tssDst, prompbmarshal.TimeSeries{
Labels: labels[labelsLen:],
Samples: samples[len(samples)-1:],
})
}
}
// Store the (possibly reallocated) buffers back into ctx, so they are reused later.
ctx.buf = buf
ctx.ctx.WriteRequest.Timeseries = tssDst
ctx.ctx.Labels = labels
ctx.ctx.Samples = samples
ctx.commonLabels = commonLabels
remotewrite.Push(&ctx.ctx.WriteRequest)
rowsInserted.Add(rowsTotal)
rowsPerInsert.Update(float64(rowsTotal))
return nil
}
// pushCtx is a reusable context for converting Influx rows to WriteRequest.
type pushCtx struct {
// ctx holds the WriteRequest being assembled together with its label and sample buffers.
ctx common.PushCtx
// commonLabels holds labels shared by all the fields of the currently processed row.
commonLabels []prompbmarshal.Label
// metricGroupBuf holds the metric name prefix for the currently processed row.
metricGroupBuf []byte
// buf accumulates metric names for all the generated time series.
buf []byte
}
// reset prepares ctx for reuse: the embedded common.PushCtx is reset,
// commonLabels are cleaned and all the buffers are truncated to zero length
// while keeping their capacity.
func (ctx *pushCtx) reset() {
	ctx.ctx.Reset()

	promrelabel.CleanLabels(ctx.commonLabels)
	ctx.commonLabels = ctx.commonLabels[:0]

	ctx.buf = ctx.buf[:0]
	ctx.metricGroupBuf = ctx.metricGroupBuf[:0]
}
// getPushCtx returns a pushCtx ready for use.
//
// The buffered channel of recently returned contexts is tried first, then
// the sync.Pool; a fresh pushCtx is allocated only if both are empty.
//
// Call putPushCtx when the returned ctx is no longer needed.
func getPushCtx() *pushCtx {
	select {
	case ctx := <-pushCtxPoolCh:
		return ctx
	default:
		// The channel is empty; fall through to the sync.Pool.
	}

	v := pushCtxPool.Get()
	if v == nil {
		return &pushCtx{}
	}
	return v.(*pushCtx)
}
// putPushCtx resets ctx and hands it back for reuse.
//
// The ctx goes into the buffered channel when it has room and into the
// sync.Pool otherwise. ctx mustn't be used after the call.
func putPushCtx(ctx *pushCtx) {
	ctx.reset()

	select {
	case pushCtxPoolCh <- ctx:
		return
	default:
		// The channel is full; fall back to the sync.Pool.
	}
	pushCtxPool.Put(ctx)
}
// pushCtxPool is the fallback storage for pushCtx objects: it is used by
// getPushCtx when pushCtxPoolCh is empty and by putPushCtx when it is full.
var pushCtxPool sync.Pool
// pushCtxPoolCh holds up to cgroup.AvailableCPUs() pushCtx objects and is
// consulted before pushCtxPool by both getPushCtx and putPushCtx.
var pushCtxPoolCh = make(chan *pushCtx, cgroup.AvailableCPUs())

275
app/vmagent/main.go Normal file
View File

@@ -0,0 +1,275 @@
package main
import (
"flag"
"fmt"
"net/http"
"os"
"strings"
"sync/atomic"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/csvimport"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/graphite"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/influx"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/native"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/opentsdb"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/opentsdbhttp"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/prometheusimport"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/promremotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/vmimport"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/envflag"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
graphiteserver "github.com/VictoriaMetrics/VictoriaMetrics/lib/ingestserver/graphite"
influxserver "github.com/VictoriaMetrics/VictoriaMetrics/lib/ingestserver/influx"
opentsdbserver "github.com/VictoriaMetrics/VictoriaMetrics/lib/ingestserver/opentsdb"
opentsdbhttpserver "github.com/VictoriaMetrics/VictoriaMetrics/lib/ingestserver/opentsdbhttp"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promscrape"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/common"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/writeconcurrencylimiter"
"github.com/VictoriaMetrics/metrics"
)
// Command-line flags for vmagent listeners and for the dry-run mode.
var (
	// httpListenAddr is the address of the HTTP server; empty value disables it.
	httpListenAddr = flag.String("httpListenAddr", ":8429", "TCP address to listen for http connections. "+
		"Set this flag to empty value in order to disable listening on any port. This mode may be useful for running multiple vmagent instances on the same server. "+
		"Note that /targets and /metrics pages aren't available if -httpListenAddr=''")
	// influxListenAddr enables the Influx line protocol listener when non-empty.
	influxListenAddr = flag.String("influxListenAddr", "", "TCP and UDP address to listen for Influx line protocol data. Usually :8189 must be set. Doesn't work if empty. "+
		"This flag isn't needed when ingesting data over HTTP - just send it to `http://<vmagent>:8429/write`")
	// graphiteListenAddr enables the Graphite plaintext listener when non-empty.
	graphiteListenAddr = flag.String("graphiteListenAddr", "", "TCP and UDP address to listen for Graphite plaintext data. Usually :2003 must be set. Doesn't work if empty")
	// opentsdbListenAddr enables the OpenTSDB telnet/HTTP listener when non-empty.
	opentsdbListenAddr = flag.String("opentsdbListenAddr", "", "TCP and UDP address to listen for OpenTSDB metrics. "+
		"Telnet put messages and HTTP /api/put messages are simultaneously served on TCP port. "+
		"Usually :4242 must be set. Doesn't work if empty")
	// opentsdbHTTPListenAddr enables the OpenTSDB HTTP-only listener when non-empty.
	opentsdbHTTPListenAddr = flag.String("opentsdbHTTPListenAddr", "", "TCP address to listen for OpenTSDB HTTP put requests. Usually :4242 must be set. Doesn't work if empty")
	// dryRun makes vmagent validate config files and exit.
	dryRun = flag.Bool("dryRun", false, "Whether to check only config files without running vmagent. The following files are checked: "+
		"-promscrape.config, -remoteWrite.relabelConfig, -remoteWrite.urlRelabelConfig . "+
		"Unknown config entries are allowed in -promscrape.config by default. This can be changed with -promscrape.config.strictParse")
)
// Ingestion servers started by main when the corresponding `-*ListenAddr`
// flag is non-empty. They remain nil otherwise.
var (
influxServer *influxserver.Server
graphiteServer *graphiteserver.Server
opentsdbServer *opentsdbserver.Server
opentsdbhttpServer *opentsdbhttpserver.Server
)
// main is the vmagent entry point.
//
// It parses flags, handles the dry-run modes (config validation followed by
// exit), starts remote write, the optional ingestion servers, scraping and
// the HTTP server, then blocks until procutil.WaitForSigterm returns a signal
// and shuts everything down.
func main() {
	// Write flags and help message to stdout, since it is easier to grep or pipe.
	flag.CommandLine.SetOutput(os.Stdout)
	flag.Usage = usage
	envflag.Parse()
	remotewrite.InitSecretFlags()
	buildinfo.Init()
	logger.Init()

	if promscrape.IsDryRun() {
		if err := promscrape.CheckConfig(); err != nil {
			logger.Fatalf("error when checking -promscrape.config: %s", err)
		}
		logger.Infof("-promscrape.config is ok; exiting with 0 status code")
		return
	}
	if *dryRun {
		if err := remotewrite.CheckRelabelConfigs(); err != nil {
			logger.Fatalf("error when checking relabel configs: %s", err)
		}
		if err := promscrape.CheckConfig(); err != nil {
			logger.Fatalf("error when checking -promscrape.config: %s", err)
		}
		logger.Infof("all the configs are ok; exiting with 0 status code")
		return
	}

	logger.Infof("starting vmagent at %q...", *httpListenAddr)
	startTime := time.Now()
	remotewrite.Init()
	common.StartUnmarshalWorkers()
	writeconcurrencylimiter.Init()

	// Start only the ingestion servers with non-empty listen addresses.
	if len(*influxListenAddr) > 0 {
		influxServer = influxserver.MustStart(*influxListenAddr, influx.InsertHandlerForReader)
	}
	if len(*graphiteListenAddr) > 0 {
		graphiteServer = graphiteserver.MustStart(*graphiteListenAddr, graphite.InsertHandler)
	}
	if len(*opentsdbListenAddr) > 0 {
		opentsdbServer = opentsdbserver.MustStart(*opentsdbListenAddr, opentsdb.InsertHandler, opentsdbhttp.InsertHandler)
	}
	if len(*opentsdbHTTPListenAddr) > 0 {
		opentsdbhttpServer = opentsdbhttpserver.MustStart(*opentsdbHTTPListenAddr, opentsdbhttp.InsertHandler)
	}

	promscrape.Init(remotewrite.Push)

	if len(*httpListenAddr) > 0 {
		go httpserver.Serve(*httpListenAddr, requestHandler)
	}
	logger.Infof("started vmagent in %.3f seconds", time.Since(startTime).Seconds())

	sig := procutil.WaitForSigterm()
	logger.Infof("received signal %s", sig)

	// Shutdown: the HTTP server goes first, then scraping and the ingestion
	// servers, and remote write goes last.
	startTime = time.Now()
	if len(*httpListenAddr) > 0 {
		logger.Infof("gracefully shutting down webservice at %q", *httpListenAddr)
		if err := httpserver.Stop(*httpListenAddr); err != nil {
			logger.Fatalf("cannot stop the webservice: %s", err)
		}
		logger.Infof("successfully shut down the webservice in %.3f seconds", time.Since(startTime).Seconds())
	}

	promscrape.Stop()

	if len(*influxListenAddr) > 0 {
		influxServer.MustStop()
	}
	if len(*graphiteListenAddr) > 0 {
		graphiteServer.MustStop()
	}
	if len(*opentsdbListenAddr) > 0 {
		opentsdbServer.MustStop()
	}
	if len(*opentsdbHTTPListenAddr) > 0 {
		opentsdbhttpServer.MustStop()
	}
	common.StopUnmarshalWorkers()
	remotewrite.Stop()

	logger.Infof("successfully stopped vmagent in %.3f seconds", time.Since(startTime).Seconds())
}
func requestHandler(w http.ResponseWriter, r *http.Request) bool {
if r.URL.Path == "/" {
fmt.Fprintf(w, "vmagent - see docs at https://victoriametrics.github.io/vmagent.html")
return true
}
path := strings.Replace(r.URL.Path, "//", "/", -1)
switch path {
case "/api/v1/write":
prometheusWriteRequests.Inc()
if err := promremotewrite.InsertHandler(r); err != nil {
prometheusWriteErrors.Inc()
httpserver.Errorf(w, r, "error in %q: %s", r.URL.Path, err)
return true
}
w.WriteHeader(http.StatusNoContent)
return true
case "/api/v1/import":
vmimportRequests.Inc()
if err := vmimport.InsertHandler(r); err != nil {
vmimportErrors.Inc()
httpserver.Errorf(w, r, "error in %q: %s", r.URL.Path, err)
return true
}
w.WriteHeader(http.StatusNoContent)
return true
case "/api/v1/import/csv":
csvimportRequests.Inc()
if err := csvimport.InsertHandler(r); err != nil {
csvimportErrors.Inc()
httpserver.Errorf(w, r, "error in %q: %s", r.URL.Path, err)
return true
}
w.WriteHeader(http.StatusNoContent)
return true
case "/api/v1/import/prometheus":
prometheusimportRequests.Inc()
if err := prometheusimport.InsertHandler(r); err != nil {
prometheusimportErrors.Inc()
httpserver.Errorf(w, r, "error in %q: %s", r.URL.Path, err)
return true
}
w.WriteHeader(http.StatusNoContent)
return true
case "/api/v1/import/native":
nativeimportRequests.Inc()
if err := native.InsertHandler(r); err != nil {
nativeimportErrors.Inc()
httpserver.Errorf(w, r, "error in %q: %s", r.URL.Path, err)
return true
}
w.WriteHeader(http.StatusNoContent)
return true
case "/write", "/api/v2/write":
influxWriteRequests.Inc()
if err := influx.InsertHandlerForHTTP(r); err != nil {
influxWriteErrors.Inc()
httpserver.Errorf(w, r, "error in %q: %s", r.URL.Path, err)
return true
}
w.WriteHeader(http.StatusNoContent)
return true
case "/query":
// Emulate fake response for influx query.
// This is required for TSBS benchmark.
influxQueryRequests.Inc()
fmt.Fprintf(w, `{"results":[{"series":[{"values":[]}]}]}`)
return true
case "/targets":
promscrapeTargetsRequests.Inc()
promscrape.WriteHumanReadableTargetsStatus(w, r)
return true
case "/api/v1/targets":
promscrapeAPIV1TargetsRequests.Inc()
w.Header().Set("Content-Type", "application/json; charset=utf-8")
state := r.FormValue("state")
promscrape.WriteAPIV1Targets(w, state)
return true
case "/-/reload":
promscrapeConfigReloadRequests.Inc()
procutil.SelfSIGHUP()
w.WriteHeader(http.StatusOK)
return true
case "/ready":
if rdy := atomic.LoadInt32(&promscrape.PendingScrapeConfigs); rdy > 0 {
errMsg := fmt.Sprintf("waiting for scrapes to init, left: %d", rdy)
http.Error(w, errMsg, http.StatusTooEarly)
} else {
w.Header().Set("Content-Type", "text/plain; charset=utf-8")
w.WriteHeader(http.StatusOK)
w.Write([]byte("OK"))
}
return true
}
return false
}
// Per-endpoint counters updated by requestHandler.
//
// `*Requests` counters are incremented per each request to the given path,
// while `*Errors` counters are incremented when the corresponding insert handler returns an error.
var (
prometheusWriteRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/api/v1/write", protocol="promremotewrite"}`)
prometheusWriteErrors = metrics.NewCounter(`vmagent_http_request_errors_total{path="/api/v1/write", protocol="promremotewrite"}`)
vmimportRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/api/v1/import", protocol="vmimport"}`)
vmimportErrors = metrics.NewCounter(`vmagent_http_request_errors_total{path="/api/v1/import", protocol="vmimport"}`)
csvimportRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/api/v1/import/csv", protocol="csvimport"}`)
csvimportErrors = metrics.NewCounter(`vmagent_http_request_errors_total{path="/api/v1/import/csv", protocol="csvimport"}`)
prometheusimportRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/api/v1/import/prometheus", protocol="prometheusimport"}`)
prometheusimportErrors = metrics.NewCounter(`vmagent_http_request_errors_total{path="/api/v1/import/prometheus", protocol="prometheusimport"}`)
nativeimportRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/api/v1/import/native", protocol="nativeimport"}`)
nativeimportErrors = metrics.NewCounter(`vmagent_http_request_errors_total{path="/api/v1/import/native", protocol="nativeimport"}`)
// The influx write counters are shared by the `/write` and `/api/v2/write` paths.
influxWriteRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/write", protocol="influx"}`)
influxWriteErrors = metrics.NewCounter(`vmagent_http_request_errors_total{path="/write", protocol="influx"}`)
influxQueryRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/query", protocol="influx"}`)
promscrapeTargetsRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/targets"}`)
promscrapeAPIV1TargetsRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/api/v1/targets"}`)
promscrapeConfigReloadRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/-/reload"}`)
)
// usage passes a short vmagent description to flagutil.Usage.
//
// It is installed as flag.Usage in main.
func usage() {
	const description = `
vmagent collects metrics data via popular data ingestion protocols and routes it to VictoriaMetrics.
See the docs at https://victoriametrics.github.io/vmagent.html .
`
	flagutil.Usage(description)
}

View File

@@ -0,0 +1,85 @@
package native
import (
"net/http"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/common"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
parserCommon "github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/common"
parser "github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/native"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/writeconcurrencylimiter"
"github.com/VictoriaMetrics/metrics"
)
// Metrics updated by insertRows for data ingested via the native import handler.
var (
// rowsInserted is increased by the number of values in every block passed to insertRows.
rowsInserted = metrics.NewCounter(`vmagent_rows_inserted_total{type="native"}`)
// rowsPerInsert records the number of values in every block passed to insertRows.
rowsPerInsert = metrics.NewHistogram(`vmagent_rows_per_insert{type="native"}`)
)
// InsertHandler processes `/api/v1/import/native` request.
//
// Extra labels extracted from req are attached to every ingested block.
// Parsing runs under writeconcurrencylimiter. The returned error comes either
// from extra label extraction or from the limiter/parser.
//
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6
func InsertHandler(req *http.Request) error {
	extraLabels, err := parserCommon.GetExtraLabels(req)
	if err != nil {
		return err
	}

	// pushBlock is invoked by the parser for every parsed block.
	pushBlock := func(block *parser.Block) error {
		return insertRows(block, extraLabels)
	}
	return writeconcurrencylimiter.Do(func() error {
		return parser.ParseStream(req, pushBlock)
	})
}
// insertRows converts the given block into a single Prometheus time series
// with extraLabels appended to its labels and passes it to remotewrite.Push.
//
// The function panics via logger.Panicf if the block contains a different
// number of timestamps and values.
func insertRows(block *parser.Block, extraLabels []prompbmarshal.Label) error {
	pushCtx := common.GetPushCtx()
	defer common.PutPushCtx(pushCtx)

	// Account the rows before the actual inserting,
	// since relabeling can prevent from inserting the rows.
	rowsCount := len(block.Values)
	rowsInserted.Add(rowsCount)
	rowsPerInsert.Update(float64(rowsCount))

	// The block holds a single series, so the reused buffers are filled from the start.
	metricName := &block.MetricName
	dstLabels := append(pushCtx.Labels[:0], prompbmarshal.Label{
		Name:  "__name__",
		Value: bytesutil.ToUnsafeString(metricName.MetricGroup),
	})
	for i := range metricName.Tags {
		t := &metricName.Tags[i]
		dstLabels = append(dstLabels, prompbmarshal.Label{
			Name:  bytesutil.ToUnsafeString(t.Key),
			Value: bytesutil.ToUnsafeString(t.Value),
		})
	}
	dstLabels = append(dstLabels, extraLabels...)

	if len(block.Timestamps) != len(block.Values) {
		logger.Panicf("BUG: len(timestamps)=%d must match len(values)=%d", len(block.Timestamps), len(block.Values))
	}
	dstSamples := pushCtx.Samples[:0]
	for i, v := range block.Values {
		dstSamples = append(dstSamples, prompbmarshal.Sample{
			Value:     v,
			Timestamp: block.Timestamps[i],
		})
	}

	pushCtx.WriteRequest.Timeseries = append(pushCtx.WriteRequest.Timeseries[:0], prompbmarshal.TimeSeries{
		Labels:  dstLabels,
		Samples: dstSamples,
	})
	pushCtx.Labels = dstLabels
	pushCtx.Samples = dstSamples
	remotewrite.Push(&pushCtx.WriteRequest)
	return nil
}

View File

@@ -0,0 +1,65 @@
package opentsdb
import (
"io"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/common"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
parser "github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/opentsdb"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/writeconcurrencylimiter"
"github.com/VictoriaMetrics/metrics"
)
// Metrics for the rows received via OpenTSDB put protocol.
var (
// rowsInserted is increased by the number of rows in every processed batch.
rowsInserted = metrics.NewCounter(`vmagent_rows_inserted_total{type="opentsdb"}`)
// rowsPerInsert is updated with the number of rows in every processed batch.
rowsPerInsert = metrics.NewHistogram(`vmagent_rows_per_insert{type="opentsdb"}`)
)
// InsertHandler processes remote write for OpenTSDB put protocol.
//
// See http://opentsdb.net/docs/build/html/api_telnet/put.html
func InsertHandler(r io.Reader) error {
return writeconcurrencylimiter.Do(func() error {
return parser.ParseStream(r, insertRows)
})
}
// insertRows converts the given rows into Prometheus time series and passes
// them to remotewrite.Push.
//
// Every row becomes a separate time series with a single sample.
func insertRows(rows []parser.Row) error {
	pushCtx := common.GetPushCtx()
	defer common.PutPushCtx(pushCtx)

	dstSeries := pushCtx.WriteRequest.Timeseries[:0]
	dstLabels := pushCtx.Labels[:0]
	dstSamples := pushCtx.Samples[:0]
	for i := range rows {
		row := &rows[i]
		labelsStart, samplesStart := len(dstLabels), len(dstSamples)
		dstLabels = append(dstLabels, prompbmarshal.Label{
			Name:  "__name__",
			Value: row.Metric,
		})
		for j := range row.Tags {
			dstLabels = append(dstLabels, prompbmarshal.Label{
				Name:  row.Tags[j].Key,
				Value: row.Tags[j].Value,
			})
		}
		dstSamples = append(dstSamples, prompbmarshal.Sample{
			Value:     row.Value,
			Timestamp: row.Timestamp,
		})
		dstSeries = append(dstSeries, prompbmarshal.TimeSeries{
			Labels:  dstLabels[labelsStart:],
			Samples: dstSamples[samplesStart:],
		})
	}

	// Store the (possibly grown) buffers back to the context, so they are reused.
	pushCtx.WriteRequest.Timeseries = dstSeries
	pushCtx.Labels = dstLabels
	pushCtx.Samples = dstSamples
	remotewrite.Push(&pushCtx.WriteRequest)

	rowsCount := len(rows)
	rowsInserted.Add(rowsCount)
	rowsPerInsert.Update(float64(rowsCount))
	return nil
}

View File

@@ -0,0 +1,64 @@
package opentsdbhttp
import (
"net/http"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/common"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
parser "github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/opentsdbhttp"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/writeconcurrencylimiter"
"github.com/VictoriaMetrics/metrics"
)
// Metrics for the rows received via HTTP OpenTSDB put requests.
var (
// rowsInserted is increased by the number of rows in every processed batch.
rowsInserted = metrics.NewCounter(`vmagent_rows_inserted_total{type="opentsdbhttp"}`)
// rowsPerInsert is updated with the number of rows in every processed batch.
rowsPerInsert = metrics.NewHistogram(`vmagent_rows_per_insert{type="opentsdbhttp"}`)
)
// InsertHandler processes HTTP OpenTSDB put requests.
// See http://opentsdb.net/docs/build/html/api_http/put.html
func InsertHandler(req *http.Request) error {
return writeconcurrencylimiter.Do(func() error {
return parser.ParseStream(req, insertRows)
})
}
// insertRows converts the given rows into Prometheus time series and passes
// them to remotewrite.Push.
//
// Every row becomes a separate time series with a single sample.
func insertRows(rows []parser.Row) error {
	pushCtx := common.GetPushCtx()
	defer common.PutPushCtx(pushCtx)

	series := pushCtx.WriteRequest.Timeseries[:0]
	lbls := pushCtx.Labels[:0]
	smpls := pushCtx.Samples[:0]
	for rowIdx := range rows {
		row := &rows[rowIdx]
		firstLabel := len(lbls)
		firstSample := len(smpls)

		lbls = append(lbls, prompbmarshal.Label{
			Name:  "__name__",
			Value: row.Metric,
		})
		for tagIdx := range row.Tags {
			t := &row.Tags[tagIdx]
			lbls = append(lbls, prompbmarshal.Label{
				Name:  t.Key,
				Value: t.Value,
			})
		}
		smpls = append(smpls, prompbmarshal.Sample{
			Value:     row.Value,
			Timestamp: row.Timestamp,
		})
		series = append(series, prompbmarshal.TimeSeries{
			Labels:  lbls[firstLabel:],
			Samples: smpls[firstSample:],
		})
	}

	// Store the (possibly grown) buffers back to the context, so they are reused.
	pushCtx.WriteRequest.Timeseries = series
	pushCtx.Labels = lbls
	pushCtx.Samples = smpls
	remotewrite.Push(&pushCtx.WriteRequest)

	n := len(rows)
	rowsInserted.Add(n)
	rowsPerInsert.Update(float64(n))
	return nil
}

View File

@@ -0,0 +1,76 @@
package prometheusimport
import (
"net/http"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/common"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
parserCommon "github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/common"
parser "github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/prometheus"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/writeconcurrencylimiter"
"github.com/VictoriaMetrics/metrics"
)
// Metrics for the rows received via `/api/v1/import/prometheus`.
var (
// rowsInserted is increased by the number of rows in every processed batch.
rowsInserted = metrics.NewCounter(`vmagent_rows_inserted_total{type="prometheus"}`)
// rowsPerInsert is updated with the number of rows in every processed batch.
rowsPerInsert = metrics.NewHistogram(`vmagent_rows_per_insert{type="prometheus"}`)
)
// InsertHandler processes `/api/v1/import/prometheus` request.
//
// Extra labels and the default timestamp are read from req before parsing.
// The request body is treated as gzip-compressed when the `Content-Encoding`
// request header equals `gzip`.
func InsertHandler(req *http.Request) error {
	extraLabels, err := parserCommon.GetExtraLabels(req)
	if err != nil {
		return err
	}
	defaultTimestamp, err := parserCommon.GetTimestamp(req)
	if err != nil {
		return err
	}
	onRows := func(rows []parser.Row) error {
		return insertRows(rows, extraLabels)
	}
	return writeconcurrencylimiter.Do(func() error {
		gzipped := req.Header.Get("Content-Encoding") == "gzip"
		return parser.ParseStream(req.Body, defaultTimestamp, gzipped, onRows)
	})
}
// insertRows converts the given rows into Prometheus time series and passes
// them to remotewrite.Push.
//
// Every row becomes a separate time series with a single sample.
// extraLabels are appended to the labels of every time series.
func insertRows(rows []parser.Row, extraLabels []prompbmarshal.Label) error {
	pushCtx := common.GetPushCtx()
	defer common.PutPushCtx(pushCtx)

	dstSeries := pushCtx.WriteRequest.Timeseries[:0]
	dstLabels := pushCtx.Labels[:0]
	dstSamples := pushCtx.Samples[:0]
	for i := range rows {
		row := &rows[i]
		labelsStart, samplesStart := len(dstLabels), len(dstSamples)
		dstLabels = append(dstLabels, prompbmarshal.Label{
			Name:  "__name__",
			Value: row.Metric,
		})
		for j := range row.Tags {
			dstLabels = append(dstLabels, prompbmarshal.Label{
				Name:  row.Tags[j].Key,
				Value: row.Tags[j].Value,
			})
		}
		dstLabels = append(dstLabels, extraLabels...)
		dstSamples = append(dstSamples, prompbmarshal.Sample{
			Value:     row.Value,
			Timestamp: row.Timestamp,
		})
		dstSeries = append(dstSeries, prompbmarshal.TimeSeries{
			Labels:  dstLabels[labelsStart:],
			Samples: dstSamples[samplesStart:],
		})
	}

	// Store the (possibly grown) buffers back to the context, so they are reused.
	pushCtx.WriteRequest.Timeseries = dstSeries
	pushCtx.Labels = dstLabels
	pushCtx.Samples = dstSamples
	remotewrite.Push(&pushCtx.WriteRequest)

	rowsCount := len(rows)
	rowsInserted.Add(rowsCount)
	rowsPerInsert.Update(float64(rowsCount))
	return nil
}

View File

@@ -0,0 +1,67 @@
package promremotewrite
import (
"net/http"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/common"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompb"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
parser "github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/promremotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/writeconcurrencylimiter"
"github.com/VictoriaMetrics/metrics"
)
// Metrics for the samples received via Prometheus remote write protocol.
var (
// rowsInserted is increased by the number of samples in every processed request.
rowsInserted = metrics.NewCounter(`vmagent_rows_inserted_total{type="promremotewrite"}`)
// rowsPerInsert is updated with the number of samples in every processed request.
rowsPerInsert = metrics.NewHistogram(`vmagent_rows_per_insert{type="promremotewrite"}`)
)
// InsertHandler processes remote write for prometheus.
func InsertHandler(req *http.Request) error {
return writeconcurrencylimiter.Do(func() error {
return parser.ParseStream(req, insertRows)
})
}
// insertRows converts the given prompb time series into prompbmarshal time series
// and passes them to remotewrite.Push.
//
// The total number of samples across all the time series is accounted
// in rowsInserted and rowsPerInsert.
func insertRows(timeseries []prompb.TimeSeries) error {
	pushCtx := common.GetPushCtx()
	defer common.PutPushCtx(pushCtx)

	samplesTotal := 0
	dstSeries := pushCtx.WriteRequest.Timeseries[:0]
	dstLabels := pushCtx.Labels[:0]
	dstSamples := pushCtx.Samples[:0]
	for i := range timeseries {
		src := &timeseries[i]
		samplesTotal += len(src.Samples)

		labelsStart := len(dstLabels)
		for j := range src.Labels {
			srcLabel := &src.Labels[j]
			dstLabels = append(dstLabels, prompbmarshal.Label{
				Name:  bytesutil.ToUnsafeString(srcLabel.Name),
				Value: bytesutil.ToUnsafeString(srcLabel.Value),
			})
		}

		samplesStart := len(dstSamples)
		for k := range src.Samples {
			srcSample := &src.Samples[k]
			dstSamples = append(dstSamples, prompbmarshal.Sample{
				Value:     srcSample.Value,
				Timestamp: srcSample.Timestamp,
			})
		}

		dstSeries = append(dstSeries, prompbmarshal.TimeSeries{
			Labels:  dstLabels[labelsStart:],
			Samples: dstSamples[samplesStart:],
		})
	}

	// Store the (possibly grown) buffers back to the context, so they are reused.
	pushCtx.WriteRequest.Timeseries = dstSeries
	pushCtx.Labels = dstLabels
	pushCtx.Samples = dstSamples
	remotewrite.Push(&pushCtx.WriteRequest)

	rowsInserted.Add(samplesTotal)
	rowsPerInsert.Update(float64(samplesTotal))
	return nil
}

View File

@@ -0,0 +1,267 @@
package remotewrite
import (
"bytes"
"crypto/tls"
"encoding/base64"
"fmt"
"io/ioutil"
"net/http"
"net/url"
"strings"
"sync"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/persistentqueue"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promauth"
"github.com/VictoriaMetrics/metrics"
)
// Command-line flags for configuring the client connection to -remoteWrite.url.
//
// All of them are array flags: newClient and getTLSConfig read the value
// at the index of the corresponding -remoteWrite.url.
var (
sendTimeout = flagutil.NewArrayDuration("remoteWrite.sendTimeout", "Timeout for sending a single block of data to -remoteWrite.url")
proxyURL = flagutil.NewArray("remoteWrite.proxyURL", "Optional proxy URL for writing data to -remoteWrite.url. Supported proxies: http, https, socks5. "+
"Example: -remoteWrite.proxyURL=socks5://proxy:1234")
tlsInsecureSkipVerify = flagutil.NewArrayBool("remoteWrite.tlsInsecureSkipVerify", "Whether to skip tls verification when connecting to -remoteWrite.url")
tlsCertFile = flagutil.NewArray("remoteWrite.tlsCertFile", "Optional path to client-side TLS certificate file to use when connecting to -remoteWrite.url. "+
"If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url")
tlsKeyFile = flagutil.NewArray("remoteWrite.tlsKeyFile", "Optional path to client-side TLS certificate key to use when connecting to -remoteWrite.url. "+
"If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url")
tlsCAFile = flagutil.NewArray("remoteWrite.tlsCAFile", "Optional path to TLS CA file to use for verifying connections to -remoteWrite.url. "+
"By default system CA is used. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url")
tlsServerName = flagutil.NewArray("remoteWrite.tlsServerName", "Optional TLS server name to use for connections to -remoteWrite.url. "+
"By default the server name from -remoteWrite.url is used. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url")
basicAuthUsername = flagutil.NewArray("remoteWrite.basicAuth.username", "Optional basic auth username to use for -remoteWrite.url. "+
"If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url")
basicAuthPassword = flagutil.NewArray("remoteWrite.basicAuth.password", "Optional basic auth password to use for -remoteWrite.url. "+
"If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url")
bearerToken = flagutil.NewArray("remoteWrite.bearerToken", "Optional bearer auth token to use for -remoteWrite.url. "+
"If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url")
)
// client sends blocks read from fq to a single remote storage url.
type client struct {
// sanitizedURL is used in log messages and metric labels instead of remoteWriteURL.
sanitizedURL string
// remoteWriteURL is the url the blocks are POSTed to.
remoteWriteURL string
// authHeader is the value for `Authorization` request header. The header isn't set if authHeader is empty.
authHeader string
// fq is the queue the workers read blocks from.
fq *persistentqueue.FastQueue
// hc is the http client used for sending the blocks.
hc *http.Client
// Per-url metrics.
requestDuration *metrics.Histogram
requestsOKCount *metrics.Counter
errorsCount *metrics.Counter
packetsDropped *metrics.Counter
retriesCount *metrics.Counter
// wg is used for waiting until all the workers are stopped.
wg sync.WaitGroup
// stopCh is closed by MustStop in order to notify the workers about the stop.
stopCh chan struct{}
}
// newClient returns a client, which sends blocks read from fq to remoteWriteURL
// by `concurrency` concurrently running workers.
//
// argIdx is the index of the values to read from the -remoteWrite.* array flags.
// sanitizedURL is used in logs and metric labels instead of remoteWriteURL.
//
// The workers are stopped by MustStop.
func newClient(argIdx int, remoteWriteURL, sanitizedURL string, fq *persistentqueue.FastQueue, concurrency int) *client {
	tlsCfg, err := getTLSConfig(argIdx)
	if err != nil {
		logger.Panicf("FATAL: cannot initialize TLS config: %s", err)
	}
	maxConns := 2 * concurrency
	transport := &http.Transport{
		Dial:                statDial,
		TLSClientConfig:     tlsCfg,
		TLSHandshakeTimeout: 5 * time.Second,
		MaxConnsPerHost:     maxConns,
		MaxIdleConnsPerHost: maxConns,
		IdleConnTimeout:     time.Minute,
		WriteBufferSize:     64 * 1024,
	}

	// Set up the optional proxy.
	if proxy := proxyURL.GetOptionalArg(argIdx); len(proxy) > 0 {
		if !strings.Contains(proxy, "://") {
			logger.Fatalf("cannot parse -remoteWrite.proxyURL=%q: it must start with `http://`, `https://` or `socks5://`", proxy)
		}
		parsedProxy, err := url.Parse(proxy)
		if err != nil {
			logger.Fatalf("cannot parse -remoteWrite.proxyURL=%q: %s", proxy, err)
		}
		transport.Proxy = http.ProxyURL(parsedProxy)
	}

	// Set up the optional auth header. Basic auth and bearer token are mutually exclusive.
	var authHeader string
	user := basicAuthUsername.GetOptionalArg(argIdx)
	pass := basicAuthPassword.GetOptionalArg(argIdx)
	if user != "" || pass != "" {
		// See https://en.wikipedia.org/wiki/Basic_access_authentication
		credentials := []byte(user + ":" + pass)
		authHeader = "Basic " + base64.StdEncoding.EncodeToString(credentials)
	}
	if token := bearerToken.GetOptionalArg(argIdx); token != "" {
		if authHeader != "" {
			logger.Fatalf("`-remoteWrite.bearerToken`=%q cannot be set when `-remoteWrite.basicAuth.*` flags are set", token)
		}
		authHeader = "Bearer " + token
	}

	c := &client{
		sanitizedURL:   sanitizedURL,
		remoteWriteURL: remoteWriteURL,
		authHeader:     authHeader,
		fq:             fq,
		hc: &http.Client{
			Transport: transport,
			Timeout:   sendTimeout.GetOptionalArgOrDefault(argIdx, time.Minute),
		},
		stopCh: make(chan struct{}),
	}
	c.requestDuration = metrics.GetOrCreateHistogram(fmt.Sprintf(`vmagent_remotewrite_duration_seconds{url=%q}`, c.sanitizedURL))
	c.requestsOKCount = metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_remotewrite_requests_total{url=%q, status_code="2XX"}`, c.sanitizedURL))
	c.errorsCount = metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_remotewrite_errors_total{url=%q}`, c.sanitizedURL))
	c.packetsDropped = metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_remotewrite_packets_dropped_total{url=%q}`, c.sanitizedURL))
	c.retriesCount = metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_remotewrite_retries_count_total{url=%q}`, c.sanitizedURL))

	// Start the workers.
	for workersLeft := concurrency; workersLeft > 0; workersLeft-- {
		c.wg.Add(1)
		go func() {
			defer c.wg.Done()
			c.runWorker()
		}()
	}
	logger.Infof("initialized client for -remoteWrite.url=%q", c.sanitizedURL)
	return c
}
// MustStop notifies the workers about the stop by closing c.stopCh
// and waits until all of them are finished.
func (c *client) MustStop() {
close(c.stopCh)
c.wg.Wait()
logger.Infof("stopped client for -remoteWrite.url=%q", c.sanitizedURL)
}
// getTLSConfig returns TLS config built from the -remoteWrite.tls* flag values at the given argIdx.
//
// It returns (nil, nil) if none of these flags is set at argIdx.
func getTLSConfig(argIdx int) (*tls.Config, error) {
	tlsConf := &promauth.TLSConfig{
		CAFile:             tlsCAFile.GetOptionalArg(argIdx),
		CertFile:           tlsCertFile.GetOptionalArg(argIdx),
		KeyFile:            tlsKeyFile.GetOptionalArg(argIdx),
		ServerName:         tlsServerName.GetOptionalArg(argIdx),
		InsecureSkipVerify: tlsInsecureSkipVerify.GetOptionalArg(argIdx),
	}
	hasFiles := tlsConf.CAFile != "" || tlsConf.CertFile != "" || tlsConf.KeyFile != ""
	if !hasFiles && tlsConf.ServerName == "" && !tlsConf.InsecureSkipVerify {
		// Nothing is configured.
		return nil, nil
	}
	authCfg, err := promauth.NewConfig(".", nil, "", "", tlsConf)
	if err != nil {
		return nil, fmt.Errorf("cannot populate TLS config: %w", err)
	}
	return authCfg.NewTLSConfig(), nil
}
// runWorker reads blocks from c.fq and sends them to the remote storage
// until the queue returns no more blocks or c.stopCh is closed.
//
// When c.stopCh is closed while a block is being sent, the worker waits
// for up to 5 seconds for the send to finish and then returns.
func (c *client) runWorker() {
	var ok bool
	var block []byte
	// The channel must be buffered. Otherwise the sending goroutine below
	// would block forever (goroutine leak) on writing to the channel
	// if the worker exits on the grace period timeout without reading from it.
	// A single slot is enough, since at most one send is in flight per worker.
	ch := make(chan struct{}, 1)
	for {
		block, ok = c.fq.MustReadBlock(block[:0])
		if !ok {
			return
		}
		// Send the block in a separate goroutine, so the worker can react on c.stopCh.
		go func() {
			c.sendBlock(block)
			ch <- struct{}{}
		}()
		select {
		case <-ch:
			// sendBlock has returned.
			continue
		case <-c.stopCh:
			// c must be stopped. Wait for a while in the hope the block will be sent.
			graceDuration := 5 * time.Second
			select {
			case <-ch:
				// sendBlock has returned.
			case <-time.After(graceDuration):
				logger.Errorf("couldn't sent block with size %d bytes to %q in %.3f seconds during shutdown; dropping it",
					len(block), c.sanitizedURL, graceDuration.Seconds())
			}
			return
		}
	}
}
// sendBlock sends the given block to c.remoteWriteURL via HTTP POST.
//
// The request is retried until one of the following occurs:
//
//   - a response with 2xx status code is received;
//   - a response with 409 status code is received; the block is dropped in this case;
//   - c.stopCh is closed while waiting before the next retry.
//
// The delay before the next retry is doubled after every failed attempt
// and is limited by a minute.
func (c *client) sendBlock(block []byte) {
retryDuration := time.Second
retriesCount := 0
again:
// A new request is created on every attempt.
req, err := http.NewRequest("POST", c.remoteWriteURL, bytes.NewBuffer(block))
if err != nil {
logger.Panicf("BUG: unexected error from http.NewRequest(%q): %s", c.sanitizedURL, err)
}
h := req.Header
h.Set("User-Agent", "vmagent")
h.Set("Content-Type", "application/x-protobuf")
h.Set("Content-Encoding", "snappy")
h.Set("X-Prometheus-Remote-Write-Version", "0.1.0")
if c.authHeader != "" {
req.Header.Set("Authorization", c.authHeader)
}
startTime := time.Now()
resp, err := c.hc.Do(req)
c.requestDuration.UpdateDuration(startTime)
if err != nil {
// The request has failed without a response. Retry it after a delay.
c.errorsCount.Inc()
retryDuration *= 2
if retryDuration > time.Minute {
retryDuration = time.Minute
}
logger.Errorf("couldn't send a block with size %d bytes to %q: %s; re-sending the block in %.3f seconds",
len(block), c.sanitizedURL, err, retryDuration.Seconds())
t := time.NewTimer(retryDuration)
select {
case <-c.stopCh:
// Give up on stop.
t.Stop()
return
case <-t.C:
}
c.retriesCount.Inc()
goto again
}
statusCode := resp.StatusCode
if statusCode/100 == 2 {
// The block has been accepted by the remote storage.
_ = resp.Body.Close()
c.requestsOKCount.Inc()
return
}
// Count non-2xx responses per each status code.
metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_remotewrite_requests_total{url=%q, status_code="%d"}`, c.sanitizedURL, statusCode)).Inc()
if statusCode == 409 {
// Just drop block on 409 status code like Prometheus does.
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/873
// The error from reading the response body is ignored, since the body is used only for logging.
body, _ := ioutil.ReadAll(resp.Body)
_ = resp.Body.Close()
logger.Errorf("unexpected status code received when sending a block with size %d bytes to %q: #%d; dropping the block like Prometheus does; "+
"response body=%q", len(block), c.sanitizedURL, statusCode, body)
c.packetsDropped.Inc()
return
}
// Unexpected status code returned
// NOTE(review): the local retriesCount is increased only here, so retries caused
// by request errors above aren't reflected in the `retry #%d` log messages.
retriesCount++
retryDuration *= 2
if retryDuration > time.Minute {
retryDuration = time.Minute
}
body, err := ioutil.ReadAll(resp.Body)
_ = resp.Body.Close()
if err != nil {
logger.Errorf("cannot read response body from %q during retry #%d: %s", c.sanitizedURL, retriesCount, err)
} else {
logger.Errorf("unexpected status code received after sending a block with size %d bytes to %q during retry #%d: %d; response body=%q; "+
"re-sending the block in %.3f seconds", len(block), c.sanitizedURL, retriesCount, statusCode, body, retryDuration.Seconds())
}
t := time.NewTimer(retryDuration)
select {
case <-c.stopCh:
// Give up on stop.
t.Stop()
return
case <-t.C:
}
c.retriesCount.Inc()
goto again
}

View File

@@ -0,0 +1,200 @@
package remotewrite
import (
"flag"
"sync"
"sync/atomic"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/persistentqueue"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promrelabel"
"github.com/VictoriaMetrics/metrics"
"github.com/golang/snappy"
)
// Command-line flags, which control flushing of the pending data.
var (
// flushInterval is read by pendingSeries.periodicFlusher.
flushInterval = flag.Duration("remoteWrite.flushInterval", time.Second, "Interval for flushing the data to remote storage. "+
"Higher value reduces network bandwidth usage at the cost of delayed push of scraped data to remote storage. "+
"Minimum supported interval is 1 second")
// maxUnpackedBlockSize limits the size of the marshaled (not yet compressed) request in pushWriteRequest.
maxUnpackedBlockSize = flagutil.NewBytes("remoteWrite.maxBlockSize", 8*1024*1024, "The maximum size in bytes of unpacked request to send to remote storage. "+
"It shouldn't exceed -maxInsertRequestSize from VictoriaMetrics")
)
// maxRowsPerBlock is the maximum number of rows to send per each block.
//
// writeRequest.push flushes the pending data as soon as the number
// of pending samples reaches this value.
const maxRowsPerBlock = 10000
// pendingSeries accumulates time series in wr and periodically flushes them.
type pendingSeries struct {
// mu protects wr.
mu sync.Mutex
wr writeRequest
// stopCh is closed by MustStop in order to stop the periodic flusher.
stopCh chan struct{}
// periodicFlusherWG is used for waiting until the periodic flusher is stopped.
periodicFlusherWG sync.WaitGroup
}
// newPendingSeries returns pendingSeries, which passes the prepared blocks to pushBlock.
//
// A periodic flusher goroutine is started for the returned pendingSeries.
// It is stopped by MustStop.
func newPendingSeries(pushBlock func(block []byte)) *pendingSeries {
	ps := &pendingSeries{
		stopCh: make(chan struct{}),
	}
	ps.wr.pushBlock = pushBlock

	ps.periodicFlusherWG.Add(1)
	go func() {
		defer ps.periodicFlusherWG.Done()
		ps.periodicFlusher()
	}()
	return ps
}
// MustStop stops the periodic flusher and waits until it is finished.
func (ps *pendingSeries) MustStop() {
close(ps.stopCh)
ps.periodicFlusherWG.Wait()
}
// Push adds tss to the pending data under ps.mu.
func (ps *pendingSeries) Push(tss []prompbmarshal.TimeSeries) {
ps.mu.Lock()
ps.wr.push(tss)
ps.mu.Unlock()
}
// periodicFlusher flushes the pending data on every tick if at least
// flushSeconds elapsed since the last flush.
//
// The pending data is flushed for the last time when ps.stopCh is closed.
// The function returns after that.
func (ps *pendingSeries) periodicFlusher() {
	flushSeconds := int64(flushInterval.Seconds())
	if flushSeconds <= 0 {
		flushSeconds = 1
	}
	// time.NewTicker panics on non-positive duration, so fall back to a second
	// if -remoteWrite.flushInterval is set to zero or negative value.
	tickInterval := *flushInterval
	if tickInterval <= 0 {
		tickInterval = time.Second
	}
	ticker := time.NewTicker(tickInterval)
	defer ticker.Stop()
	mustStop := false
	for !mustStop {
		select {
		case <-ps.stopCh:
			// Perform the final flush below and exit the loop.
			mustStop = true
		case <-ticker.C:
			if fasttime.UnixTimestamp()-atomic.LoadUint64(&ps.wr.lastFlushTime) < uint64(flushSeconds) {
				// The data has been flushed recently, e.g. by writeRequest.push.
				continue
			}
		}
		ps.mu.Lock()
		ps.wr.flush()
		ps.mu.Unlock()
	}
}
// writeRequest accumulates copies of the pushed time series until they are flushed.
type writeRequest struct {
// Move lastFlushTime to the top of the struct in order to guarantee atomic access on 32-bit architectures.
lastFlushTime uint64
// wr is the request passed to pushWriteRequest on flush.
wr prompbmarshal.WriteRequest
// pushBlock is called for every block prepared on flush.
pushBlock func(block []byte)
// tss contains the pending time series. Their labels and samples point to the labels and samples below.
tss []prompbmarshal.TimeSeries
labels []prompbmarshal.Label
samples []prompbmarshal.Sample
// buf holds the bytes of label names and values referred by labels.
buf []byte
}
// reset clears the pending data while keeping the allocated buffers for further reuse.
func (wr *writeRequest) reset() {
	wr.wr.Timeseries = nil

	// Drop references to labels and samples from the pending time series.
	pending := wr.tss
	for i := range pending {
		pending[i].Labels = nil
		pending[i].Samples = nil
	}
	wr.tss = pending[:0]

	promrelabel.CleanLabels(wr.labels)
	wr.labels = wr.labels[:0]

	wr.samples = wr.samples[:0]
	wr.buf = wr.buf[:0]
}
// flush passes the pending time series to pushWriteRequest and resets wr.
//
// lastFlushTime is updated before the push.
func (wr *writeRequest) flush() {
wr.wr.Timeseries = wr.tss
atomic.StoreUint64(&wr.lastFlushTime, fasttime.UnixTimestamp())
pushWriteRequest(&wr.wr, wr.pushBlock)
wr.reset()
}
// push appends copies of src time series to the pending data.
//
// The pending data is flushed as soon as the number of pending samples
// reaches maxRowsPerBlock.
func (wr *writeRequest) push(src []prompbmarshal.TimeSeries) {
	for i := range src {
		wr.tss = append(wr.tss, prompbmarshal.TimeSeries{})
		wr.copyTimeSeries(&wr.tss[len(wr.tss)-1], &src[i])
		if len(wr.samples) < maxRowsPerBlock {
			continue
		}
		wr.flush()
	}
}
// copyTimeSeries copies labels and samples from src to dst.
//
// The copied data is stored in wr.labels, wr.samples and wr.buf,
// so dst doesn't refer to the memory owned by src.
func (wr *writeRequest) copyTimeSeries(dst, src *prompbmarshal.TimeSeries) {
	labelsStart := len(wr.labels)
	for i := range src.Labels {
		srcLabel := &src.Labels[i]
		var dstLabel prompbmarshal.Label

		// Copy the label name to wr.buf and refer to the copied bytes.
		wr.buf = append(wr.buf, srcLabel.Name...)
		dstLabel.Name = bytesutil.ToUnsafeString(wr.buf[len(wr.buf)-len(srcLabel.Name):])

		// The same for the label value.
		wr.buf = append(wr.buf, srcLabel.Value...)
		dstLabel.Value = bytesutil.ToUnsafeString(wr.buf[len(wr.buf)-len(srcLabel.Value):])

		wr.labels = append(wr.labels, dstLabel)
	}
	dst.Labels = wr.labels[labelsStart:]

	samplesStart := len(wr.samples)
	wr.samples = append(wr.samples, src.Samples...)
	dst.Samples = wr.samples[samplesStart:]
}
// pushWriteRequest marshals wr, compresses it with snappy and passes
// the resulting block to pushBlock.
//
// If the marshaled request exceeds -remoteWrite.maxBlockSize or the compressed
// block exceeds persistentqueue.MaxBlockSize, then wr is recursively split
// into smaller parts:
//
//   - multiple time series are split into two halves;
//   - a single time series is split by samples;
//   - a single time series, which cannot be split further, is dropped.
//
// wr.Timeseries is restored to its original contents before returning.
func pushWriteRequest(wr *prompbmarshal.WriteRequest, pushBlock func(block []byte)) {
	if len(wr.Timeseries) == 0 {
		// Nothing to push
		return
	}
	bb := writeRequestBufPool.Get()
	bb.B = prompbmarshal.MarshalWriteRequest(bb.B[:0], wr)
	if len(bb.B) <= maxUnpackedBlockSize.N {
		zb := snappyBufPool.Get()
		zb.B = snappy.Encode(zb.B[:cap(zb.B)], bb.B)
		writeRequestBufPool.Put(bb)
		if len(zb.B) <= persistentqueue.MaxBlockSize {
			pushBlock(zb.B)
			blockSizeRows.Update(float64(len(wr.Timeseries)))
			blockSizeBytes.Update(float64(len(zb.B)))
			snappyBufPool.Put(zb)
			return
		}
		snappyBufPool.Put(zb)
	} else {
		writeRequestBufPool.Put(bb)
	}

	// Too big block. Recursively split it into smaller parts if possible.
	timeseries := wr.Timeseries
	if len(timeseries) == 1 {
		// A single time series cannot be split in halves by series:
		// this would result in an empty part plus the same time series again,
		// e.g. in endless recursion. Split its samples instead.
		samples := timeseries[0].Samples
		if len(samples) <= 1 {
			// The time series cannot be split further, so drop it.
			// The drop is counted in a metric, since the logger package may be not imported here.
			metrics.GetOrCreateCounter(`vmagent_remotewrite_oversized_blocks_dropped_total`).Inc()
			return
		}
		n := len(samples) / 2
		timeseries[0].Samples = samples[:n]
		pushWriteRequest(wr, pushBlock)
		timeseries[0].Samples = samples[n:]
		pushWriteRequest(wr, pushBlock)
		timeseries[0].Samples = samples
		return
	}
	n := len(timeseries) / 2
	wr.Timeseries = timeseries[:n]
	pushWriteRequest(wr, pushBlock)
	wr.Timeseries = timeseries[n:]
	pushWriteRequest(wr, pushBlock)
	wr.Timeseries = timeseries
}
// Metrics updated by pushWriteRequest for every pushed block.
var (
// blockSizeBytes is updated with the size of the compressed block.
blockSizeBytes = metrics.NewHistogram(`vmagent_remotewrite_block_size_bytes`)
// blockSizeRows is updated with the number of time series in the block.
blockSizeRows = metrics.NewHistogram(`vmagent_remotewrite_block_size_rows`)
)
// writeRequestBufPool is a pool of buffers for marshaled write requests.
var writeRequestBufPool bytesutil.ByteBufferPool
// snappyBufPool is a pool of buffers for snappy-compressed blocks.
var snappyBufPool bytesutil.ByteBufferPool

View File

@@ -0,0 +1,140 @@
package remotewrite
import (
"flag"
"fmt"
"strings"
"sync"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promrelabel"
)
// Command-line flags for labels and relabeling applied before sending the data to -remoteWrite.url.
var (
// unparsedLabelsGlobal is parsed into labelsGlobal by initLabelsGlobal.
unparsedLabelsGlobal = flagutil.NewArray("remoteWrite.label", "Optional label in the form 'name=value' to add to all the metrics before sending them to -remoteWrite.url. "+
"Pass multiple -remoteWrite.label flags in order to add multiple flags to metrics before sending them to remote storage")
relabelConfigPathGlobal = flag.String("remoteWrite.relabelConfig", "", "Optional path to file with relabel_config entries. These entries are applied to all the metrics "+
"before sending them to -remoteWrite.url. See https://victoriametrics.github.io/vmagent.html#relabeling for details")
relabelConfigPaths = flagutil.NewArray("remoteWrite.urlRelabelConfig", "Optional path to relabel config for the corresponding -remoteWrite.url")
)
// labelsGlobal contains labels parsed from -remoteWrite.label flags. It is set by initLabelsGlobal.
var labelsGlobal []prompbmarshal.Label
// CheckRelabelConfigs checks -remoteWrite.relabelConfig and -remoteWrite.urlRelabelConfig.
//
// It returns the error from loading these configs if any.
func CheckRelabelConfigs() error {
	if _, err := loadRelabelConfigs(); err != nil {
		return err
	}
	return nil
}
// loadRelabelConfigs loads relabel configs from the files pointed
// by -remoteWrite.relabelConfig and -remoteWrite.urlRelabelConfig.
//
// The returned perURL slice has an entry per each -remoteWrite.url.
// The entry is empty if the corresponding -remoteWrite.urlRelabelConfig is missing or empty.
func loadRelabelConfigs() (*relabelConfigs, error) {
	rcs := &relabelConfigs{}
	if globalPath := *relabelConfigPathGlobal; globalPath != "" {
		prcs, err := promrelabel.LoadRelabelConfigs(globalPath)
		if err != nil {
			return nil, fmt.Errorf("cannot load -remoteWrite.relabelConfig=%q: %w", globalPath, err)
		}
		rcs.global = prcs
	}

	urlsCount := len(*remoteWriteURLs)
	if pathsCount := len(*relabelConfigPaths); pathsCount > urlsCount {
		return nil, fmt.Errorf("too many -remoteWrite.urlRelabelConfig args: %d; it mustn't exceed the number of -remoteWrite.url args: %d",
			pathsCount, urlsCount)
	}
	rcs.perURL = make([][]promrelabel.ParsedRelabelConfig, urlsCount)
	for i, path := range *relabelConfigPaths {
		if path == "" {
			// Skip empty relabel config.
			continue
		}
		prcs, err := promrelabel.LoadRelabelConfigs(path)
		if err != nil {
			return nil, fmt.Errorf("cannot load relabel configs from -remoteWrite.urlRelabelConfig=%q: %w", path, err)
		}
		rcs.perURL[i] = prcs
	}
	return rcs, nil
}
// relabelConfigs contains relabel configs loaded by loadRelabelConfigs.
type relabelConfigs struct {
// global is loaded from -remoteWrite.relabelConfig.
global []promrelabel.ParsedRelabelConfig
// perURL contains configs loaded from -remoteWrite.urlRelabelConfig; it is indexed by -remoteWrite.url position.
perURL [][]promrelabel.ParsedRelabelConfig
}
// initLabelsGlobal parses -remoteWrite.label flags into labelsGlobal.
//
// Empty flag values are skipped. The value is split into label name and label value
// at the first '='. A fatal error is logged if '=' is missing.
//
// initLabelsGlobal must be called after parsing command-line flags.
func initLabelsGlobal() {
	labelsGlobal = nil
	for _, unparsed := range *unparsedLabelsGlobal {
		if unparsed == "" {
			continue
		}
		eqPos := strings.IndexByte(unparsed, '=')
		if eqPos < 0 {
			logger.Fatalf("missing '=' in `-remoteWrite.label`. It must contain label in the form `name=value`; got %q", unparsed)
		}
		name, value := unparsed[:eqPos], unparsed[eqPos+1:]
		labelsGlobal = append(labelsGlobal, prompbmarshal.Label{
			Name:  name,
			Value: value,
		})
	}
}
// applyRelabeling adds extraLabels to tss and then applies prcs to them.
//
// The result is written in place to tss, so tss contents mustn't be used after the call.
// Time series without labels after the relabeling are dropped from the result.
// Labels of the returned time series are stored in rctx.labels.
//
// tss is returned as is if both extraLabels and prcs are empty.
func (rctx *relabelCtx) applyRelabeling(tss []prompbmarshal.TimeSeries, extraLabels []prompbmarshal.Label, prcs []promrelabel.ParsedRelabelConfig) []prompbmarshal.TimeSeries {
	if len(extraLabels) == 0 && len(prcs) == 0 {
		// Nothing to change.
		return tss
	}
	result := tss[:0]
	labelsBuf := rctx.labels[:0]
	for i := range tss {
		series := &tss[i]
		start := len(labelsBuf)
		labelsBuf = append(labelsBuf, series.Labels...)
		// extraLabels must be added before applying relabeling according to https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write
		for j := range extraLabels {
			extra := &extraLabels[j]
			if existing := promrelabel.GetLabelByName(labelsBuf[start:], extra.Name); existing != nil {
				// Override the value of the already existing label.
				existing.Value = extra.Value
				continue
			}
			labelsBuf = append(labelsBuf, *extra)
		}
		labelsBuf = promrelabel.ApplyRelabelConfigs(labelsBuf, start, prcs, true)
		if len(labelsBuf) == start {
			// Drop the current time series, since relabeling removed all the labels.
			continue
		}
		result = append(result, prompbmarshal.TimeSeries{
			Labels:  labelsBuf[start:],
			Samples: series.Samples,
		})
	}
	rctx.labels = labelsBuf
	return result
}
// relabelCtx holds the state reused across applyRelabeling calls.
type relabelCtx struct {
// pool for labels, which are used during the relabeling.
labels []prompbmarshal.Label
}
// reset cleans rctx.labels via promrelabel.CleanLabels and truncates them to zero length.
func (rctx *relabelCtx) reset() {
promrelabel.CleanLabels(rctx.labels)
rctx.labels = rctx.labels[:0]
}
// relabelCtxPool is a pool of relabelCtx objects. See getRelabelCtx and putRelabelCtx.
var relabelCtxPool = &sync.Pool{
New: func() interface{} {
return &relabelCtx{}
},
}
// getRelabelCtx returns relabelCtx from relabelCtxPool.
//
// The returned relabelCtx can be put back to the pool via putRelabelCtx.
func getRelabelCtx() *relabelCtx {
return relabelCtxPool.Get().(*relabelCtx)
}
// putRelabelCtx truncates rctx.labels and returns rctx to relabelCtxPool.
//
// NOTE(review): unlike reset, the labels aren't cleaned via promrelabel.CleanLabels here —
// presumably the caller is expected to call reset before; confirm against callers.
func putRelabelCtx(rctx *relabelCtx) {
rctx.labels = rctx.labels[:0]
relabelCtxPool.Put(rctx)
}

View File

@@ -0,0 +1,276 @@
package remotewrite
import (
"flag"
"fmt"
"sync"
"sync/atomic"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/cgroup"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/decimal"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/memory"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/persistentqueue"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
"github.com/VictoriaMetrics/metrics"
xxhash "github.com/cespare/xxhash/v2"
)
// Command-line flags for the remote write component.
var (
remoteWriteURLs = flagutil.NewArray("remoteWrite.url", "Remote storage URL to write data to. It must support Prometheus remote_write API. "+
"It is recommended using VictoriaMetrics as remote storage. Example url: http://<victoriametrics-host>:8428/api/v1/write . "+
"Pass multiple -remoteWrite.url flags in order to write data concurrently to multiple remote storage systems")
tmpDataPath = flag.String("remoteWrite.tmpDataPath", "vmagent-remotewrite-data", "Path to directory where temporary data for remote write component is stored")
// queues is limited by Init to maxQueues and is set to 1 if it isn't positive.
queues = flag.Int("remoteWrite.queues", 4, "The number of concurrent queues to each -remoteWrite.url. Set more queues if default number of queues "+
"isn't enough for sending high volume of collected data to remote storage")
showRemoteWriteURL = flag.Bool("remoteWrite.showURL", false, "Whether to show -remoteWrite.url in the exported metrics. "+
"It is hidden by default, since it can contain sensitive info such as auth key")
maxPendingBytesPerURL = flagutil.NewBytes("remoteWrite.maxDiskUsagePerURL", 0, "The maximum file-based buffer size in bytes at -remoteWrite.tmpDataPath "+
"for each -remoteWrite.url. When buffer size reaches the configured maximum, then old data is dropped when adding new data to the buffer. "+
"Buffered data is stored in ~500MB chunks, so the minimum practical value for this flag is 500000000. "+
"Disk usage is unlimited if the value is set to 0")
// significantFigures is applied to sample values by Push when it is positive.
significantFigures = flag.Int("remoteWrite.significantFigures", 0, "The number of significant figures to leave in metric values before writing them to remote storage. "+
"See https://en.wikipedia.org/wiki/Significant_figures . Zero value saves all the significant figures. "+
"This option may be used for increasing on-disk compression level for the stored metrics")
)
// rwctxs contains a context per each -remoteWrite.url. It is filled by Init and cleared by Stop.
var rwctxs []*remoteWriteCtx
// Contains the current relabelConfigs.
var allRelabelConfigs atomic.Value
// maxQueues limits the maximum value for `-remoteWrite.queues`. There is no sense in setting too high value,
// since it may lead to high memory usage due to big number of buffers.
var maxQueues = cgroup.AvailableCPUs() * 4
// InitSecretFlags registers -remoteWrite.url as a secret flag unless -remoteWrite.showURL is set.
//
// InitSecretFlags must be called after flag.Parse and before any logging.
func InitSecretFlags() {
	if *showRemoteWriteURL {
		return
	}
	// remoteWrite.url can contain authentication codes, so hide it at `/metrics` output.
	flagutil.RegisterSecretFlag("remoteWrite.url")
}
// Init initializes remotewrite.
//
// It creates a context per each -remoteWrite.url and starts a goroutine,
// which reloads relabel configs on SIGHUP.
//
// It must be called after flag.Parse().
//
// Stop must be called for graceful shutdown.
func Init() {
	urlsCount := len(*remoteWriteURLs)
	if urlsCount == 0 {
		logger.Fatalf("at least one `-remoteWrite.url` command-line flag must be set")
	}

	// Limit -remoteWrite.queues by maxQueues and make sure it is positive.
	if *queues > maxQueues {
		*queues = maxQueues
	}
	if *queues <= 0 {
		*queues = 1
	}

	initLabelsGlobal()
	rcs, err := loadRelabelConfigs()
	if err != nil {
		logger.Fatalf("cannot load relabel configs: %s", err)
	}
	allRelabelConfigs.Store(rcs)

	maxInmemoryBlocks := memory.Allowed() / urlsCount / maxRowsPerBlock / 100
	switch {
	case maxInmemoryBlocks > 200:
		// There is not much sense in keeping a higher number of blocks in memory,
		// since this means that the producer outperforms the consumer and the queue
		// will continue growing. It is better to store the queue to file.
		maxInmemoryBlocks = 200
	case maxInmemoryBlocks < 2:
		maxInmemoryBlocks = 2
	}

	for i, remoteWriteURL := range *remoteWriteURLs {
		urlNum := i + 1
		sanitizedURL := fmt.Sprintf("%d:secret-url", urlNum)
		if *showRemoteWriteURL {
			sanitizedURL = fmt.Sprintf("%d:%s", urlNum, remoteWriteURL)
		}
		rwctxs = append(rwctxs, newRemoteWriteCtx(i, remoteWriteURL, maxInmemoryBlocks, sanitizedURL))
	}

	// Start config reloader.
	sighupCh := procutil.NewSighupChan()
	configReloaderWG.Add(1)
	go func() {
		defer configReloaderWG.Done()
		for {
			select {
			case <-stopCh:
				return
			case <-sighupCh:
			}
			logger.Infof("SIGHUP received; reloading relabel configs pointed by -remoteWrite.relabelConfig and -remoteWrite.urlRelabelConfig")
			reloaded, reloadErr := loadRelabelConfigs()
			if reloadErr != nil {
				logger.Errorf("cannot reload relabel configs; preserving the previous configs; error: %s", reloadErr)
				continue
			}
			allRelabelConfigs.Store(reloaded)
			logger.Infof("Successfully reloaded relabel configs")
		}
	}()
}
var (
	// stopCh is closed by Stop in order to terminate the config reloader started by Init.
	stopCh = make(chan struct{})

	// configReloaderWG is used by Stop for waiting until the config reloader exits.
	configReloaderWG sync.WaitGroup
)
// Stop stops remotewrite.
//
// It is expected that nobody calls Push during and after the call to this func.
func Stop() {
	// Terminate the config reloader first and wait until it exits.
	close(stopCh)
	configReloaderWG.Wait()

	// Then stop every per-url context and drop the references to them.
	for i := range rwctxs {
		rwctxs[i].MustStop()
	}
	rwctxs = nil
}
// Push sends wr to remote storage systems set via `-remoteWrite.url`.
//
// Note that wr may be modified by Push due to relabeling and rounding.
func Push(wr *prompbmarshal.WriteRequest) {
	if *significantFigures > 0 {
		// Round values according to significantFigures
		for i := range wr.Timeseries {
			samples := wr.Timeseries[i].Samples
			for j := range samples {
				samples[j].Value = decimal.Round(samples[j].Value, *significantFigures)
			}
		}
	}

	// The relabel context is needed only if there are global relabel configs or global labels.
	globalConfigs := allRelabelConfigs.Load().(*relabelConfigs).global
	var rctx *relabelCtx
	if len(globalConfigs) > 0 || len(labelsGlobal) > 0 {
		rctx = getRelabelCtx()
	}

	for pending := wr.Timeseries; len(pending) > 0; {
		// Process big tss in smaller blocks in order to reduce the maximum memory usage.
		// The block ends right after the series, which pushes the samples count over maxRowsPerBlock.
		end := len(pending)
		samplesCount := 0
		for i := range pending {
			samplesCount += len(pending[i].Samples)
			if samplesCount > maxRowsPerBlock {
				end = i + 1
				break
			}
		}
		tssBlock := pending[:end]
		pending = pending[end:]

		if rctx != nil {
			lenBefore := len(tssBlock)
			tssBlock = rctx.applyRelabeling(tssBlock, labelsGlobal, globalConfigs)
			globalRelabelMetricsDropped.Add(lenBefore - len(tssBlock))
		}
		for i := range rwctxs {
			rwctxs[i].Push(tssBlock)
		}
		if rctx != nil {
			rctx.reset()
		}
	}

	if rctx != nil {
		putRelabelCtx(rctx)
	}
}
// globalRelabelMetricsDropped counts the time series removed by the global relabeling step in Push,
// e.g. the difference between the number of series before and after applyRelabeling.
var globalRelabelMetricsDropped = metrics.NewCounter("vmagent_remotewrite_global_relabel_metrics_dropped_total")
// remoteWriteCtx holds the state for a single `-remoteWrite.url`.
//
// It is created by newRemoteWriteCtx and must be stopped with MustStop.
type remoteWriteCtx struct {
// idx is the zero-based index of the url; it is used for selecting per-url relabel configs in Push.
idx int
// fq is the persistent queue, which receives the blocks written by pss.
fq *persistentqueue.FastQueue
// c is the client created for the url on top of fq.
c *client
// pss contains `-remoteWrite.queues` pendingSeries; Push spreads the incoming data among them.
pss []*pendingSeries
// pssNextIdx is atomically incremented on every Push for selecting the next item from pss.
pssNextIdx uint64
// relabelMetricsDropped counts the time series removed by per-url relabeling in Push.
relabelMetricsDropped *metrics.Counter
}
// newRemoteWriteCtx creates remoteWriteCtx for the given remoteWriteURL.
//
// argIdx is the zero-based index of the url among `-remoteWrite.url` flags.
// maxInmemoryBlocks is passed to the persistent queue opened for the url.
// sanitizedURL is used instead of remoteWriteURL in the exported metric labels.
func newRemoteWriteCtx(argIdx int, remoteWriteURL string, maxInmemoryBlocks int, sanitizedURL string) *remoteWriteCtx {
	// The queue path depends on both the position of the url and the hash of the url.
	urlHash := xxhash.Sum64([]byte(remoteWriteURL))
	queuePath := fmt.Sprintf("%s/persistent-queue/%d_%016X", *tmpDataPath, argIdx+1, urlHash)
	fq := persistentqueue.MustOpenFastQueue(queuePath, sanitizedURL, maxInmemoryBlocks, maxPendingBytesPerURL.N)

	// Export the queue state as gauges.
	pendingBytesName := fmt.Sprintf(`vmagent_remotewrite_pending_data_bytes{path=%q, url=%q}`, queuePath, sanitizedURL)
	_ = metrics.GetOrCreateGauge(pendingBytesName, func() float64 {
		return float64(fq.GetPendingBytes())
	})
	inmemoryBlocksName := fmt.Sprintf(`vmagent_remotewrite_pending_inmemory_blocks{path=%q, url=%q}`, queuePath, sanitizedURL)
	_ = metrics.GetOrCreateGauge(inmemoryBlocksName, func() float64 {
		return float64(fq.GetInmemoryQueueLen())
	})

	rwctx := &remoteWriteCtx{
		idx: argIdx,
		fq:  fq,
	}
	rwctx.c = newClient(argIdx, remoteWriteURL, sanitizedURL, fq, *queues)
	rwctx.pss = make([]*pendingSeries, *queues)
	for i := range rwctx.pss {
		rwctx.pss[i] = newPendingSeries(fq.MustWriteBlock)
	}
	droppedName := fmt.Sprintf(`vmagent_remotewrite_relabel_metrics_dropped_total{path=%q, url=%q}`, queuePath, sanitizedURL)
	rwctx.relabelMetricsDropped = metrics.GetOrCreateCounter(droppedName)
	return rwctx
}
// MustStop stops rwctx and releases the references held by it.
//
// The pending series are stopped first, then the queue is closed and then the client is stopped.
func (rwctx *remoteWriteCtx) MustStop() {
	for i := range rwctx.pss {
		rwctx.pss[i].MustStop()
	}
	rwctx.fq.MustClose()
	rwctx.c.MustStop()

	// Drop the references, so rwctx cannot be used after the stop.
	rwctx.idx = 0
	rwctx.pss = nil
	rwctx.fq = nil
	rwctx.c = nil
	rwctx.relabelMetricsDropped = nil
}
// Push applies per-url relabeling to tss if it is configured for rwctx
// and passes the result to the next pendingSeries from rwctx.pss.
func (rwctx *remoteWriteCtx) Push(tss []prompbmarshal.TimeSeries) {
	var (
		rctx   *relabelCtx
		pooled *[]prompbmarshal.TimeSeries
	)
	perURLConfigs := allRelabelConfigs.Load().(*relabelConfigs).perURL[rwctx.idx]
	if len(perURLConfigs) > 0 {
		rctx = getRelabelCtx()
		// Make a copy of tss before applying relabeling in order to prevent
		// from affecting time series for other remoteWrite.url configs.
		// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/467
		// and https://github.com/VictoriaMetrics/VictoriaMetrics/issues/599
		pooled = tssRelabelPool.Get().(*[]prompbmarshal.TimeSeries)
		tss = append(*pooled, tss...)
		lenBefore := len(tss)
		tss = rctx.applyRelabeling(tss, nil, perURLConfigs)
		rwctx.relabelMetricsDropped.Add(lenBefore - len(tss))
	}

	// Select the destination pendingSeries in a round-robin manner.
	n := atomic.AddUint64(&rwctx.pssNextIdx, 1)
	rwctx.pss[n%uint64(len(rwctx.pss))].Push(tss)

	if rctx != nil {
		// Return the copy and the relabel context to their pools.
		*pooled = prompbmarshal.ResetTimeSeries(tss)
		tssRelabelPool.Put(pooled)
		putRelabelCtx(rctx)
	}
}
// tssRelabelPool contains reusable slices, which remoteWriteCtx.Push uses
// for copying time series before per-url relabeling.
var tssRelabelPool = &sync.Pool{
	New: func() interface{} {
		tss := make([]prompbmarshal.TimeSeries, 0)
		return &tss
	},
}

View File

@@ -0,0 +1,81 @@
package remotewrite
import (
"fmt"
"net"
"strings"
"sync/atomic"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/netutil"
"github.com/VictoriaMetrics/fasthttp"
"github.com/VictoriaMetrics/metrics"
)
func statDial(network, addr string) (conn net.Conn, err error) {
if !strings.HasPrefix(network, "tcp") {
return nil, fmt.Errorf("unexpected network passed to statDial: %q; it must start from `tcp`", network)
}
if netutil.TCP6Enabled() {
conn, err = fasthttp.DialDualStack(addr)
} else {
conn, err = fasthttp.Dial(addr)
}
dialsTotal.Inc()
if err != nil {
dialErrors.Inc()
return nil, err
}
conns.Inc()
sc := &statConn{
Conn: conn,
}
return sc, nil
}
// Dial-level metrics updated by statDial and statConn.Close.
var (
// dialsTotal is incremented by statDial after every dial attempt.
dialsTotal = metrics.NewCounter(`vmagent_remotewrite_dials_total`)
// dialErrors is incremented by statDial when the dial attempt returns an error.
dialErrors = metrics.NewCounter(`vmagent_remotewrite_dial_errors_total`)
// conns is incremented by statDial on every successful dial and decremented
// on the first Close call for the returned connection.
conns = metrics.NewCounter(`vmagent_remotewrite_conns`)
)
// statConn wraps net.Conn and updates connection metrics in Read, Write and Close.
type statConn struct {
// closed is atomically incremented on every Close call,
// so conns is decremented only on the first call.
// NOTE(review): it looks intentionally placed first for 64-bit alignment of atomic ops on 32-bit platforms - confirm.
closed uint64
net.Conn
}
// Read reads data from the underlying connection into p and updates read metrics.
//
// It returns the values returned by the underlying Read unchanged.
func (sc *statConn) Read(p []byte) (int, error) {
	bytesRead, readErr := sc.Conn.Read(p)

	connReadsTotal.Inc()
	connBytesRead.Add(bytesRead)
	if readErr != nil {
		connReadErrors.Inc()
	}
	return bytesRead, readErr
}
// Write writes p to the underlying connection and updates write metrics.
//
// It returns the values returned by the underlying Write unchanged.
func (sc *statConn) Write(p []byte) (int, error) {
	bytesWritten, writeErr := sc.Conn.Write(p)

	connWritesTotal.Inc()
	connBytesWritten.Add(bytesWritten)
	if writeErr != nil {
		connWriteErrors.Inc()
	}
	return bytesWritten, writeErr
}
// Close closes the underlying connection and returns its error.
//
// The conns metric is decremented only on the first Close call.
func (sc *statConn) Close() error {
	closeErr := sc.Conn.Close()
	if closeCalls := atomic.AddUint64(&sc.closed, 1); closeCalls == 1 {
		conns.Dec()
	}
	return closeErr
}
// Per-connection metrics updated by statConn.Read and statConn.Write.
var (
// connReadsTotal and connWritesTotal are incremented on every Read and Write call.
connReadsTotal = metrics.NewCounter(`vmagent_remotewrite_conn_reads_total`)
connWritesTotal = metrics.NewCounter(`vmagent_remotewrite_conn_writes_total`)
// connReadErrors and connWriteErrors are incremented when Read or Write returns non-nil error.
connReadErrors = metrics.NewCounter(`vmagent_remotewrite_conn_read_errors_total`)
connWriteErrors = metrics.NewCounter(`vmagent_remotewrite_conn_write_errors_total`)
// connBytesRead and connBytesWritten are increased by the number of bytes returned from Read and Write.
connBytesRead = metrics.NewCounter(`vmagent_remotewrite_conn_bytes_read_total`)
connBytesWritten = metrics.NewCounter(`vmagent_remotewrite_conn_bytes_written_total`)
)

BIN
app/vmagent/vmagent.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 69 KiB

View File

@@ -0,0 +1,81 @@
package vmimport
import (
"net/http"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/common"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
parserCommon "github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/common"
parser "github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/vmimport"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/writeconcurrencylimiter"
"github.com/VictoriaMetrics/metrics"
)
// Metrics updated by insertRows after every push.
var (
// rowsInserted is increased by the number of values in the pushed rows.
rowsInserted = metrics.NewCounter(`vmagent_rows_inserted_total{type="vmimport"}`)
// rowsPerInsert is updated with the number of values handled by a single insertRows call.
rowsPerInsert = metrics.NewHistogram(`vmagent_rows_per_insert{type="vmimport"}`)
)
// InsertHandler processes `/api/v1/import` request.
//
// The extra labels obtained from req are added to every parsed row.
// The parsing runs under writeconcurrencylimiter.
//
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6
func InsertHandler(req *http.Request) error {
	extraLabels, err := parserCommon.GetExtraLabels(req)
	if err != nil {
		return err
	}
	// processRows is called by the stream parser for every parsed batch of rows.
	processRows := func(rows []parser.Row) error {
		return insertRows(rows, extraLabels)
	}
	parseRequest := func() error {
		return parser.ParseStream(req, processRows)
	}
	return writeconcurrencylimiter.Do(parseRequest)
}
// insertRows converts rows into time series stored in a push context obtained
// from common.GetPushCtx and passes them to remotewrite.Push.
//
// extraLabels are appended to the labels of every row. The func always returns nil.
func insertRows(rows []parser.Row, extraLabels []prompbmarshal.Label) error {
	ctx := common.GetPushCtx()
	defer common.PutPushCtx(ctx)

	// Reuse the buffers owned by ctx.
	tssDst := ctx.WriteRequest.Timeseries[:0]
	labels := ctx.Labels[:0]
	samples := ctx.Samples[:0]
	rowsTotal := 0
	for rowIdx := range rows {
		row := &rows[rowIdx]
		rowsTotal += len(row.Values)

		// Labels of the row are the row tags followed by extraLabels.
		labelsStart := len(labels)
		for tagIdx := range row.Tags {
			labels = append(labels, prompbmarshal.Label{
				Name:  bytesutil.ToUnsafeString(row.Tags[tagIdx].Key),
				Value: bytesutil.ToUnsafeString(row.Tags[tagIdx].Value),
			})
		}
		labels = append(labels, extraLabels...)

		if len(row.Timestamps) != len(row.Values) {
			logger.Panicf("BUG: len(timestamps)=%d must match len(values)=%d", len(row.Timestamps), len(row.Values))
		}
		samplesStart := len(samples)
		for valueIdx := range row.Values {
			samples = append(samples, prompbmarshal.Sample{
				Value:     row.Values[valueIdx],
				Timestamp: row.Timestamps[valueIdx],
			})
		}

		// Every row becomes a single time series referring to the tails of the shared buffers.
		tssDst = append(tssDst, prompbmarshal.TimeSeries{
			Labels:  labels[labelsStart:],
			Samples: samples[samplesStart:],
		})
	}

	// Store the grown buffers back to ctx before pushing.
	ctx.WriteRequest.Timeseries = tssDst
	ctx.Labels = labels
	ctx.Samples = samples
	remotewrite.Push(&ctx.WriteRequest)

	rowsInserted.Add(rowsTotal)
	rowsPerInsert.Update(float64(rowsTotal))
	return nil
}

90
app/vmalert/Makefile Normal file
View File

@@ -0,0 +1,90 @@
# All these commands must run from repository root.
# Every target below delegates to a generic target with APP_NAME=vmalert;
# the generic targets (app-local, app-via-docker*, package-via-docker*, ...) are defined outside this file.
# Local builds; the -race variant additionally passes RACE=-race.
vmalert:
APP_NAME=vmalert $(MAKE) app-local
vmalert-race:
APP_NAME=vmalert RACE=-race $(MAKE) app-local
# Builds via docker, one target per architecture.
vmalert-prod:
APP_NAME=vmalert $(MAKE) app-via-docker
vmalert-pure-prod:
APP_NAME=vmalert $(MAKE) app-via-docker-pure
vmalert-amd64-prod:
APP_NAME=vmalert $(MAKE) app-via-docker-amd64
vmalert-arm-prod:
APP_NAME=vmalert $(MAKE) app-via-docker-arm
vmalert-arm64-prod:
APP_NAME=vmalert $(MAKE) app-via-docker-arm64
vmalert-ppc64le-prod:
APP_NAME=vmalert $(MAKE) app-via-docker-ppc64le
vmalert-386-prod:
APP_NAME=vmalert $(MAKE) app-via-docker-386
# Packaging via docker, one target per architecture.
package-vmalert:
APP_NAME=vmalert $(MAKE) package-via-docker
package-vmalert-pure:
APP_NAME=vmalert $(MAKE) package-via-docker-pure
package-vmalert-amd64:
APP_NAME=vmalert $(MAKE) package-via-docker-amd64
package-vmalert-arm:
APP_NAME=vmalert $(MAKE) package-via-docker-arm
package-vmalert-arm64:
APP_NAME=vmalert $(MAKE) package-via-docker-arm64
package-vmalert-ppc64le:
APP_NAME=vmalert $(MAKE) package-via-docker-ppc64le
package-vmalert-386:
APP_NAME=vmalert $(MAKE) package-via-docker-386
publish-vmalert:
APP_NAME=vmalert $(MAKE) publish-via-docker
# Runs vmalert tests with the race detector and coverage enabled.
test-vmalert:
go test -v -race -cover ./app/vmalert -loggerLevel=ERROR
go test -v -race -cover ./app/vmalert/datasource
go test -v -race -cover ./app/vmalert/notifier
go test -v -race -cover ./app/vmalert/config
# Builds vmalert and runs it with the test rules against local datasource, notifier and remote storage addresses.
run-vmalert: vmalert
./bin/vmalert -rule=app/vmalert/config/testdata/rules2-good.rules \
-datasource.url=http://localhost:8428 \
-notifier.url=http://localhost:9093 \
-notifier.url=http://127.0.0.1:9093 \
-remoteWrite.url=http://localhost:8428 \
-remoteRead.url=http://localhost:8428 \
-external.label=cluster=east-1 \
-external.label=replica=a \
-evaluationInterval=3s
# Local builds for the given GOARCH; CGO is enabled only for amd64.
vmalert-amd64:
CGO_ENABLED=1 GOARCH=amd64 $(MAKE) vmalert-local-with-goarch
vmalert-arm:
CGO_ENABLED=0 GOARCH=arm $(MAKE) vmalert-local-with-goarch
vmalert-arm64:
CGO_ENABLED=0 GOARCH=arm64 $(MAKE) vmalert-local-with-goarch
vmalert-ppc64le:
CGO_ENABLED=0 GOARCH=ppc64le $(MAKE) vmalert-local-with-goarch
vmalert-386:
CGO_ENABLED=0 GOARCH=386 $(MAKE) vmalert-local-with-goarch
vmalert-local-with-goarch:
APP_NAME=vmalert $(MAKE) app-local-with-goarch
vmalert-pure:
APP_NAME=vmalert $(MAKE) app-local-pure

378
app/vmalert/README.md Normal file
View File

@@ -0,0 +1,378 @@
## vmalert
`vmalert` executes a list of given [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/)
or [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/)
rules against configured address.
### Features:
* Integration with [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics) TSDB;
* VictoriaMetrics [MetricsQL](https://victoriametrics.github.io/MetricsQL.html)
support and expressions validation;
* Prometheus [alerting rules definition format](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/#defining-alerting-rules)
support;
* Integration with [Alertmanager](https://github.com/prometheus/alertmanager);
* Keeps the alerts [state on restarts](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/app/vmalert#alerts-state-on-restarts);
* Lightweight without extra dependencies.
### Limitations:
* `vmalert` executes queries against a remote datasource, which carries reliability risks because of the network.
It is recommended to configure alerts thresholds and rules expressions with understanding that network request
may fail;
* by default, rules execution is sequential within one group, but persisting of execution results to remote
storage is asynchronous. Hence, user shouldn't rely on recording rules chaining when result of previous
recording rule is reused in next one;
* `vmalert` has no UI, just an API for getting groups and rules statuses.
### QuickStart
To build `vmalert` from sources:
```
git clone https://github.com/VictoriaMetrics/VictoriaMetrics
cd VictoriaMetrics
make vmalert
```
The built binary will be placed in the `VictoriaMetrics/bin` folder.
To start using `vmalert` you will need the following things:
* list of rules - PromQL/MetricsQL expressions to execute;
* datasource address - reachable VictoriaMetrics instance for rules execution;
* notifier address - reachable [Alert Manager](https://github.com/prometheus/alertmanager) instance for processing,
aggregating alerts and sending notifications.
* remote write address - [remote write](https://prometheus.io/docs/prometheus/latest/storage/#remote-storage-integrations)
compatible storage address for storing recording rules results and alerts state in the form of timeseries. This is optional.
Then configure `vmalert` accordingly:
```
./bin/vmalert -rule=alert.rules \
-datasource.url=http://localhost:8428 \ # PromQL compatible datasource
-notifier.url=http://localhost:9093 \ # AlertManager URL
-notifier.url=http://127.0.0.1:9093 \ # AlertManager replica URL
-remoteWrite.url=http://localhost:8428 \ # remote write compatible storage to persist rules
-remoteRead.url=http://localhost:8428 \ # PromQL compatible datasource to restore alerts state from
-external.label=cluster=east-1 \ # External label to be applied for each rule
-external.label=replica=a \ # Multiple external labels may be set
-evaluationInterval=3s # Default evaluation interval if not specified in rules group
```
If you run multiple `vmalert` services for the same datastore or AlertManager - do not forget
to specify different `external.label` flags in order to define which `vmalert` generated rules or alerts.
Configuration for [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/)
and [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) rules is very
similar to Prometheus rules and configured using YAML. Configuration examples may be found
in [testdata](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmalert/config/testdata) folder.
Every `rule` belongs to `group` and every configuration file may contain arbitrary number of groups:
```yaml
groups:
[ - <rule_group> ]
```
#### Groups
Each group has following attributes:
```yaml
# The name of the group. Must be unique within a file.
name: <string>
# How often rules in the group are evaluated.
[ interval: <duration> | default = global.evaluation_interval ]
# How many rules execute at once. Increasing concurrency may speed
# up round execution speed.
[ concurrency: <integer> | default = 1 ]
rules:
[ - <rule> ... ]
```
#### Rules
There are two types of Rules:
* [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) -
Alerting rules allow defining alert conditions via [MetricsQL](https://victoriametrics.github.io/MetricsQL.html)
and to send notifications about firing alerts to [Alertmanager](https://github.com/prometheus/alertmanager).
* [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/) -
Recording rules allow you to precompute frequently needed or computationally expensive expressions
and save their result as a new set of time series.
`vmalert` forbids to define duplicates - rules with the same combination of name, expression and labels
within one group.
##### Alerting rules
The syntax for alerting rule is following:
```yaml
# The name of the alert. Must be a valid metric name.
alert: <string>
# The MetricsQL expression to evaluate.
expr: <string>
# Alerts are considered firing once they have been returned for this long.
# Alerts which have not yet fired for long enough are considered pending.
[ for: <duration> | default = 0s ]
# Labels to add or overwrite for each alert.
labels:
[ <labelname>: <tmpl_string> ]
# Annotations to add to each alert.
annotations:
[ <labelname>: <tmpl_string> ]
```
##### Recording rules
The syntax for recording rules is following:
```yaml
# The name of the time series to output to. Must be a valid metric name.
record: <string>
# The MetricsQL expression to evaluate.
expr: <string>
# Labels to add or overwrite before storing the result.
labels:
[ <labelname>: <labelvalue> ]
```
For recording rules to work `-remoteWrite.url` must be specified.
#### Alerts state on restarts
`vmalert` has no local storage, so alerts state is stored in the process memory. Hence, after reloading of `vmalert`
the process alerts state will be lost. To avoid this situation, `vmalert` should be configured via the following flags:
* `-remoteWrite.url` - URL to VictoriaMetrics (Single) or VMInsert (Cluster). `vmalert` will persist alerts state
into the configured address in the form of time series named `ALERTS` and `ALERTS_FOR_STATE` via remote-write protocol.
These are regular time series and may be queried from VM just as any other time series.
The state is stored to the configured address on every rule evaluation.
* `-remoteRead.url` - URL to VictoriaMetrics (Single) or VMSelect (Cluster). `vmalert` will try to restore alerts state
from configured address by querying time series with name `ALERTS_FOR_STATE`.
Both flags are required for the proper state restoring. Restore process may fail if time series are missing
in configured `-remoteRead.url`, weren't updated in the last `1h` or received state doesn't match current `vmalert`
rules configuration.
#### WEB
`vmalert` runs a web-server (`-httpListenAddr`) for serving metrics and alerts endpoints:
* `http://<vmalert-addr>/api/v1/groups` - list of all loaded groups and rules;
* `http://<vmalert-addr>/api/v1/alerts` - list of all active alerts;
* `http://<vmalert-addr>/api/v1/<groupName>/<alertID>/status` - get alert status by ID.
Used as alert source in AlertManager.
* `http://<vmalert-addr>/metrics` - application metrics.
* `http://<vmalert-addr>/-/reload` - hot configuration reload.
### Configuration
The shortlist of configuration flags is the following:
```
-datasource.basicAuth.password string
Optional basic auth password for -datasource.url
-datasource.basicAuth.username string
Optional basic auth username for -datasource.url
-datasource.lookback duration
Lookback defines how far to look into past when evaluating queries. For example, if datasource.lookback=5m then param "time" with value now()-5m will be added to every query.
-datasource.maxIdleConnections int
Defines the number of idle (keep-alive connections) to configured datasource.Consider to set this value equal to the value: groups_total * group.concurrency. Too low value may result into high number of sockets in TIME_WAIT state. (default 100)
-datasource.tlsCAFile string
Optional path to TLS CA file to use for verifying connections to -datasource.url. By default system CA is used
-datasource.tlsCertFile string
Optional path to client-side TLS certificate file to use when connecting to -datasource.url
-datasource.tlsInsecureSkipVerify
Whether to skip tls verification when connecting to -datasource.url
-datasource.tlsKeyFile string
Optional path to client-side TLS certificate key to use when connecting to -datasource.url
-datasource.tlsServerName string
Optional TLS server name to use for connections to -datasource.url. By default the server name from -datasource.url is used
-datasource.url string
Victoria Metrics or VMSelect url. Required parameter. E.g. http://127.0.0.1:8428
-enableTCP6
Whether to enable IPv6 for listening and dialing. By default only IPv4 TCP is used
-envflag.enable
Whether to enable reading flags from environment variables additionally to command line. Command line flag values have priority over values from environment vars. Flags are read only from command line if this flag isn't set
-envflag.prefix string
Prefix for environment variables if -envflag.enable is set
-evaluationInterval duration
How often to evaluate the rules (default 1m0s)
-external.alert.source string
External Alert Source allows to override the Source link for alerts sent to AlertManager for cases where you want to build a custom link to Grafana, Prometheus or any other service.
eg. 'explore?orgId=1&left=[\"now-1h\",\"now\",\"VictoriaMetrics\",{\"expr\": \"{{$expr|quotesEscape|pathEscape}}\"},{\"mode\":\"Metrics\"},{\"ui\":[true,true,true,\"none\"]}]'.If empty '/api/v1/:groupID/alertID/status' is used
-external.label array
Optional label in the form 'name=value' to add to all generated recording rules and alerts. Pass multiple -label flags in order to add multiple label sets.
Supports array of values separated by comma or specified via multiple flags.
-external.url string
External URL is used as alert's source for sent alerts to the notifier
-http.connTimeout duration
Incoming http connections are closed after the configured timeout. This may help spreading incoming load among a cluster of services behind load balancer. Note that the real timeout may be bigger by up to 10% as a protection from Thundering herd problem (default 2m0s)
-http.disableResponseCompression
Disable compression of HTTP responses for saving CPU resources. By default compression is enabled to save network bandwidth
-http.idleConnTimeout duration
Timeout for incoming idle http connections (default 1m0s)
-http.maxGracefulShutdownDuration duration
The maximum duration for graceful shutdown of HTTP server. Highly loaded server may require increased value for graceful shutdown (default 7s)
-http.pathPrefix string
An optional prefix to add to all the paths handled by http server. For example, if '-http.pathPrefix=/foo/bar' is set, then all the http requests will be handled on '/foo/bar/*' paths. This may be useful for proxied requests. See https://www.robustperception.io/using-external-urls-and-proxies-with-prometheus
-http.shutdownDelay duration
Optional delay before http server shutdown. During this dealy the servier returns non-OK responses from /health page, so load balancers can route new requests to other servers
-httpAuth.password string
Password for HTTP Basic Auth. The authentication is disabled if -httpAuth.username is empty
-httpAuth.username string
Username for HTTP Basic Auth. The authentication is disabled if empty. See also -httpAuth.password
-httpListenAddr string
Address to listen for http connections (default ":8880")
-loggerErrorsPerSecondLimit int
Per-second limit on the number of ERROR messages. If more than the given number of errors are emitted per second, then the remaining errors are suppressed. Zero value disables the rate limit (default 10)
-loggerFormat string
Format for logs. Possible values: default, json (default "default")
-loggerLevel string
Minimum level of errors to log. Possible values: INFO, WARN, ERROR, FATAL, PANIC (default "INFO")
-loggerOutput string
Output for the logs. Supported values: stderr, stdout (default "stderr")
-memory.allowedBytes value
Allowed size of system memory VictoriaMetrics caches may occupy. This option overrides -memory.allowedPercent if set to non-zero value. Too low value may increase cache miss rate, which usually results in higher CPU and disk IO usage. Too high value may evict too much data from OS page cache, which will result in higher disk IO usage
Supports the following optional suffixes for values: KB, MB, GB, KiB, MiB, GiB (default 0)
-memory.allowedPercent float
Allowed percent of system memory VictoriaMetrics caches may occupy. See also -memory.allowedBytes. Too low value may increase cache miss rate, which usually results in higher CPU and disk IO usage. Too high value may evict too much data from OS page cache, which will result in higher disk IO usage (default 60)
-metricsAuthKey string
Auth key for /metrics. It overrides httpAuth settings
-notifier.basicAuth.password array
Optional basic auth password for -datasource.url
Supports array of values separated by comma or specified via multiple flags.
-notifier.basicAuth.username array
Optional basic auth username for -datasource.url
Supports array of values separated by comma or specified via multiple flags.
-notifier.tlsCAFile array
Optional path to TLS CA file to use for verifying connections to -notifier.url. By default system CA is used
Supports array of values separated by comma or specified via multiple flags.
-notifier.tlsCertFile array
Optional path to client-side TLS certificate file to use when connecting to -notifier.url
Supports array of values separated by comma or specified via multiple flags.
-notifier.tlsInsecureSkipVerify
Whether to skip tls verification when connecting to -notifier.url
-notifier.tlsKeyFile array
Optional path to client-side TLS certificate key to use when connecting to -notifier.url
Supports array of values separated by comma or specified via multiple flags.
-notifier.tlsServerName array
Optional TLS server name to use for connections to -notifier.url. By default the server name from -notifier.url is used
Supports array of values separated by comma or specified via multiple flags.
-notifier.url array
Prometheus alertmanager URL. Required parameter. e.g. http://127.0.0.1:9093
Supports array of values separated by comma or specified via multiple flags.
-pprofAuthKey string
Auth key for /debug/pprof. It overrides httpAuth settings
-remoteRead.basicAuth.password string
Optional basic auth password for -remoteRead.url
-remoteRead.basicAuth.username string
Optional basic auth username for -remoteRead.url
-remoteRead.lookback duration
Lookback defines how far to look into past for alerts timeseries. For example, if lookback=1h then range from now() to now()-1h will be scanned. (default 1h0m0s)
-remoteRead.tlsCAFile string
Optional path to TLS CA file to use for verifying connections to -remoteRead.url. By default system CA is used
-remoteRead.tlsCertFile string
Optional path to client-side TLS certificate file to use when connecting to -remoteRead.url
-remoteRead.tlsInsecureSkipVerify
Whether to skip tls verification when connecting to -remoteRead.url
-remoteRead.tlsKeyFile string
Optional path to client-side TLS certificate key to use when connecting to -remoteRead.url
-remoteRead.tlsServerName string
Optional TLS server name to use for connections to -remoteRead.url. By default the server name from -remoteRead.url is used
-remoteRead.url vmalert
Optional URL to Victoria Metrics or VMSelect that will be used to restore alerts state. This configuration makes sense only if vmalert was configured with `remoteWrite.url` before and has been successfully persisted its state. E.g. http://127.0.0.1:8428
-remoteWrite.basicAuth.password string
Optional basic auth password for -remoteWrite.url
-remoteWrite.basicAuth.username string
Optional basic auth username for -remoteWrite.url
-remoteWrite.concurrency int
Defines number of writers for concurrent writing into remote querier (default 1)
-remoteWrite.flushInterval duration
Defines interval of flushes to remote write endpoint (default 5s)
-remoteWrite.maxBatchSize int
Defines defines max number of timeseries to be flushed at once (default 1000)
-remoteWrite.maxQueueSize int
Defines the max number of pending datapoints to remote write endpoint (default 100000)
-remoteWrite.tlsCAFile string
Optional path to TLS CA file to use for verifying connections to -remoteWrite.url. By default system CA is used
-remoteWrite.tlsCertFile string
Optional path to client-side TLS certificate file to use when connecting to -remoteWrite.url
-remoteWrite.tlsInsecureSkipVerify
Whether to skip tls verification when connecting to -remoteWrite.url
-remoteWrite.tlsKeyFile string
Optional path to client-side TLS certificate key to use when connecting to -remoteWrite.url
-remoteWrite.tlsServerName string
Optional TLS server name to use for connections to -remoteWrite.url. By default the server name from -remoteWrite.url is used
-remoteWrite.url string
Optional URL to Victoria Metrics or VMInsert where to persist alerts state and recording rules results in form of timeseries. E.g. http://127.0.0.1:8428
-rule array
Path to the file with alert rules.
Supports patterns. Flag can be specified multiple times.
Examples:
-rule="/path/to/file". Path to a single file with alerting rules
-rule="dir/*.yaml" -rule="/*.yaml". Relative path to all .yaml files in "dir" folder,
absolute path to all .yaml files in root.
Rule files may contain %{ENV_VAR} placeholders, which are substituted by the corresponding env vars.
Supports array of values separated by comma or specified via multiple flags.
-rule.validateExpressions
Whether to validate rules expressions via MetricsQL engine (default true)
-rule.validateTemplates
Whether to validate annotation and label templates (default true)
-tls
Whether to enable TLS (aka HTTPS) for incoming requests. -tlsCertFile and -tlsKeyFile must be set if -tls is set
-tlsCertFile string
Path to file with TLS certificate. Used only if -tls is set. Prefer ECDSA certs instead of RSA certs, since RSA certs are slow
-tlsKeyFile string
Path to file with TLS key. Used only if -tls is set
-version
Show VictoriaMetrics version
```
Pass `-help` to `vmalert` in order to see the full list of supported
command-line flags with their descriptions.
To reload configuration without `vmalert` restart send SIGHUP signal
or send GET request to `/-/reload` endpoint.
### Contributing
`vmalert` is mostly designed and built by VictoriaMetrics community.
Feel free to share your experience and ideas for improving this
software. Please keep simplicity as the main priority.
### How to build from sources
It is recommended using
[binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases)
- `vmalert` is located in `vmutils-*` archives there.
#### Development build
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.13.
2. Run `make vmalert` from the root folder of the repository.
It builds `vmalert` binary and puts it into the `bin` folder.
#### Production build
1. [Install docker](https://docs.docker.com/install/).
2. Run `make vmalert-prod` from the root folder of the repository.
It builds `vmalert-prod` binary and puts it into the `bin` folder.
#### ARM build
ARM build may run on Raspberry Pi or on [energy-efficient ARM servers](https://blog.cloudflare.com/arm-takes-wing/).
#### Development ARM build
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.13.
2. Run `make vmalert-arm` or `make vmalert-arm64` from the root folder of the repository.
It builds `vmalert-arm` or `vmalert-arm64` binary respectively and puts it into the `bin` folder.
#### Production ARM build
1. [Install docker](https://docs.docker.com/install/).
2. Run `make vmalert-arm-prod` or `make vmalert-arm64-prod` from the root folder of the repository.
It builds `vmalert-arm-prod` or `vmalert-arm64-prod` binary respectively and puts it into the `bin` folder.

437
app/vmalert/alerting.go Normal file
View File

@@ -0,0 +1,437 @@
package main
import (
"context"
"fmt"
"hash/fnv"
"sort"
"strconv"
"sync"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
"github.com/VictoriaMetrics/metrics"
)
// AlertingRule is basic alert entity.
// It holds the static rule definition (name, expression, labels, ...)
// together with the mutable evaluation state maintained by Exec.
type AlertingRule struct {
// RuleID is the rule identifier returned by ID().
RuleID uint64
// Name is the alert name; it is used as the `alertname` label value.
Name string
// Expr is the query expression evaluated on every Exec.
Expr string
// For is the time an alert has to stay pending before Exec
// switches it to the firing state.
For time.Duration
// Labels are extra labels applied in Exec to every metric returned by Expr.
Labels map[string]string
// Annotations are templated for every alert, see template().
Annotations map[string]string
// GroupID and GroupName identify the parent group.
GroupID uint64
GroupName string
// guard status fields
mu sync.RWMutex
// stores list of active alerts
alerts map[uint64]*notifier.Alert
// stores last moment of time Exec was called
lastExecTime time.Time
// stores last error that happened in Exec func
// resets on every successful Exec
// may be used as Health state
lastExecError error
// metrics holds the gauges registered for this rule in newAlertingRule.
metrics *alertingRuleMetrics
}
// alertingRuleMetrics groups the per-rule gauges created in newAlertingRule
// and unregistered in AlertingRule.Close.
type alertingRuleMetrics struct {
// errors is 1 when the last Exec stored an error, 0 otherwise.
errors *gauge
// pending is the number of alerts in StatePending.
pending *gauge
// active is the number of alerts in StateFiring.
active *gauge
}
// newAlertingRule builds an AlertingRule from the given rule config
// within the given group and registers its per-rule gauges:
// vmalert_alerts_pending, vmalert_alerts_firing and vmalert_alerts_error.
//
// The gauge callbacks only read rule state, so they take the read lock
// and do not block concurrent API readers.
func newAlertingRule(group *Group, cfg config.Rule) *AlertingRule {
	ar := &AlertingRule{
		RuleID:      cfg.ID,
		Name:        cfg.Alert,
		Expr:        cfg.Expr,
		For:         cfg.For.Duration(),
		Labels:      cfg.Labels,
		Annotations: cfg.Annotations,
		GroupID:     group.ID(),
		GroupName:   group.Name,
		alerts:      make(map[uint64]*notifier.Alert),
		metrics:     &alertingRuleMetrics{},
	}

	// countAlerts returns a gauge callback reporting the number of
	// alerts for which match returns true.
	countAlerts := func(match func(a *notifier.Alert) bool) func() float64 {
		return func() float64 {
			ar.mu.RLock()
			defer ar.mu.RUnlock()
			var num int
			for _, a := range ar.alerts {
				if match(a) {
					num++
				}
			}
			return float64(num)
		}
	}

	labels := fmt.Sprintf(`alertname=%q, group=%q, id="%d"`, ar.Name, group.Name, ar.ID())
	ar.metrics.pending = getOrCreateGauge(fmt.Sprintf(`vmalert_alerts_pending{%s}`, labels),
		countAlerts(func(a *notifier.Alert) bool { return a.State == notifier.StatePending }))
	ar.metrics.active = getOrCreateGauge(fmt.Sprintf(`vmalert_alerts_firing{%s}`, labels),
		countAlerts(func(a *notifier.Alert) bool { return a.State == notifier.StateFiring }))
	ar.metrics.errors = getOrCreateGauge(fmt.Sprintf(`vmalert_alerts_error{%s}`, labels),
		func() float64 {
			ar.mu.RLock()
			defer ar.mu.RUnlock()
			if ar.lastExecError == nil {
				return 0
			}
			return 1
		})
	return ar
}
// Close unregisters the gauges created for this rule
// (firing, pending and error) by their metric names.
func (ar *AlertingRule) Close() {
	for _, g := range []*gauge{ar.metrics.active, ar.metrics.pending, ar.metrics.errors} {
		metrics.UnregisterMetric(g.name)
	}
}
// String returns the rule name, which makes AlertingRule
// satisfy the fmt.Stringer interface.
func (ar *AlertingRule) String() string {
	name := ar.Name
	return name
}
// ID returns the identifier of the rule, which is expected
// to be unique within the parent Group.
func (ar *AlertingRule) ID() uint64 {
	id := ar.RuleID
	return id
}
// Exec executes AlertingRule expression via the given Querier.
// Based on the Querier results AlertingRule maintains notifier.Alerts:
//   - alerts marked inactive by the previous Exec are dropped;
//   - every returned metric creates a pending alert or refreshes the
//     value (and templates) of the existing one;
//   - alerts missing from the result are deleted if pending and marked
//     inactive otherwise;
//   - pending alerts that have been active for at least ar.For start firing.
//
// If series is true, the time series describing the current alerts are
// returned. Any error is also stored in lastExecError, so the rule health
// reported via metrics and API reflects every failed execution.
func (ar *AlertingRule) Exec(ctx context.Context, q datasource.Querier, series bool) ([]prompbmarshal.TimeSeries, error) {
	// the query is executed before taking the lock,
	// so readers aren't blocked while waiting for the datasource
	qMetrics, err := q.Query(ctx, ar.Expr)

	ar.mu.Lock()
	defer ar.mu.Unlock()

	ar.lastExecError = err
	ar.lastExecTime = time.Now()
	if err != nil {
		return nil, fmt.Errorf("failed to execute query %q: %w", ar.Expr, err)
	}

	for h, a := range ar.alerts {
		// cleanup inactive alerts from previous Exec
		if a.State == notifier.StateInactive {
			delete(ar.alerts, h)
		}
	}

	qFn := func(query string) ([]datasource.Metric, error) { return q.Query(ctx, query) }
	updated := make(map[uint64]struct{}, len(qMetrics))
	// update list of active alerts
	for _, m := range qMetrics {
		for k, v := range ar.Labels {
			// apply extra labels
			m.SetLabel(k, v)
		}
		h := hash(m)
		if _, ok := updated[h]; ok {
			// duplicate may be caused by extra labels
			// conflicting with the metric labels
			ar.lastExecError = fmt.Errorf("labels %v: %w", m.Labels, errDuplicate)
			return nil, ar.lastExecError
		}
		updated[h] = struct{}{}
		if a, ok := ar.alerts[h]; ok {
			if a.Value != m.Value {
				// update Value field with latest value
				a.Value = m.Value
				// and re-exec template since Value can be used
				// in templates
				if err := ar.template(a, qFn); err != nil {
					ar.lastExecError = err
					return nil, err
				}
			}
			continue
		}
		a, err := ar.newAlert(m, ar.lastExecTime, qFn)
		if err != nil {
			ar.lastExecError = err
			return nil, fmt.Errorf("failed to create alert: %w", err)
		}
		a.ID = h
		a.State = notifier.StatePending
		ar.alerts[h] = a
	}

	for h, a := range ar.alerts {
		// if alert wasn't updated in this iteration
		// means it is resolved already
		if _, ok := updated[h]; !ok {
			if a.State == notifier.StatePending {
				// alert was in Pending state - it is not
				// active anymore
				delete(ar.alerts, h)
				continue
			}
			a.State = notifier.StateInactive
			continue
		}
		if a.State == notifier.StatePending && time.Since(a.Start) >= ar.For {
			a.State = notifier.StateFiring
			alertsFired.Inc()
		}
	}
	if series {
		return ar.toTimeSeries(ar.lastExecTime), nil
	}
	return nil, nil
}
// toTimeSeries converts every alert that is not inactive into
// time series stamped with the given timestamp.
func (ar *AlertingRule) toTimeSeries(timestamp time.Time) []prompbmarshal.TimeSeries {
	var result []prompbmarshal.TimeSeries
	for _, alert := range ar.alerts {
		if alert.State != notifier.StateInactive {
			result = append(result, ar.alertToTimeSeries(alert, timestamp)...)
		}
	}
	return result
}
// UpdateWith copies the significant fields (Expr, For, Labels and
// Annotations) from the given rule, which must be an *AlertingRule.
// The alerts state isn't copied since it is expected
// to be updated by the following Execs.
func (ar *AlertingRule) UpdateWith(r Rule) error {
	switch src := r.(type) {
	case *AlertingRule:
		ar.Expr = src.Expr
		ar.For = src.For
		ar.Labels = src.Labels
		ar.Annotations = src.Annotations
		return nil
	default:
		return fmt.Errorf("BUG: attempt to update alerting rule with wrong type %#v", r)
	}
}
// hash returns FNV-1a hash of the metric labels sorted by name.
// The `__name__` label is skipped, so metrics that differ only
// by name produce the same hash.
//
// Labels are sorted on a copy, so the label order
// of the passed metric isn't changed.
//
// TODO: consider hashing algorithm in VM
func hash(m datasource.Metric) uint64 {
	labels := make([]datasource.Label, len(m.Labels))
	copy(labels, m.Labels)
	sort.Slice(labels, func(i, j int) bool {
		return labels[i].Name < labels[j].Name
	})

	hash := fnv.New64a()
	for _, l := range labels {
		// drop __name__ to be consistent with Prometheus alerting
		if l.Name == "__name__" {
			continue
		}
		hash.Write([]byte(l.Name))
		hash.Write([]byte(l.Value))
		hash.Write([]byte("\xff"))
	}
	return hash.Sum64()
}
// newAlert creates an alert for the given metric with the given start time
// and executes label and annotation templates for it.
// The returned alert has no ID and State set - this is up to the caller.
func (ar *AlertingRule) newAlert(m datasource.Metric, start time.Time, qFn notifier.QueryFn) (*notifier.Alert, error) {
	// the group name label goes first, so a time series label
	// with the same name overrides it
	alertLabels := map[string]string{alertGroupNameLabel: ar.GroupName}
	for _, l := range m.Labels {
		// __name__ is dropped to be consistent with Prometheus alerting
		if l.Name != "__name__" {
			alertLabels[l.Name] = l.Value
		}
	}
	a := &notifier.Alert{
		GroupID: ar.GroupID,
		Name:    ar.Name,
		Labels:  alertLabels,
		Value:   m.Value,
		Start:   start,
		Expr:    ar.Expr,
	}
	err := ar.template(a, qFn)
	return a, err
}
// template executes templates for the alert labels first and for the rule
// annotations second, storing both results in the alert.
// Annotations are left untouched if labels templating fails.
func (ar *AlertingRule) template(a *notifier.Alert, qFn notifier.QueryFn) error {
	labels, err := a.ExecTemplate(qFn, a.Labels)
	a.Labels = labels
	if err != nil {
		return err
	}
	annotations, err := a.ExecTemplate(qFn, ar.Annotations)
	a.Annotations = annotations
	return err
}
// AlertAPI returns the APIAlert representation of the alert
// with the given id (hash) or nil if there is no such alert.
func (ar *AlertingRule) AlertAPI(id uint64) *APIAlert {
	ar.mu.RLock()
	defer ar.mu.RUnlock()

	if alert, found := ar.alerts[id]; found {
		return ar.newAlertAPI(*alert)
	}
	return nil
}
// RuleAPI returns Rule representation in form
// of APIAlertingRule.
//
// lastExecError and lastExecTime are written by Exec under ar.mu,
// so they are read here under the read lock.
// NOTE(review): callers must not hold ar.mu when calling RuleAPI.
func (ar *AlertingRule) RuleAPI() APIAlertingRule {
	ar.mu.RLock()
	lastExecError, lastExecTime := ar.lastExecError, ar.lastExecTime
	ar.mu.RUnlock()

	var lastErr string
	if lastExecError != nil {
		lastErr = lastExecError.Error()
	}
	return APIAlertingRule{
		// encode as strings to avoid rounding
		ID:          fmt.Sprintf("%d", ar.ID()),
		GroupID:     fmt.Sprintf("%d", ar.GroupID),
		Name:        ar.Name,
		Expression:  ar.Expr,
		For:         ar.For.String(),
		LastError:   lastErr,
		LastExec:    lastExecTime,
		Labels:      ar.Labels,
		Annotations: ar.Annotations,
	}
}
// AlertsAPI returns the APIAlert representation of all the alerts
// currently tracked by the rule. The result is nil if there are no alerts.
func (ar *AlertingRule) AlertsAPI() []*APIAlert {
	ar.mu.RLock()
	defer ar.mu.RUnlock()

	var result []*APIAlert
	for _, alert := range ar.alerts {
		result = append(result, ar.newAlertAPI(*alert))
	}
	return result
}
// newAlertAPI converts the given alert into its API representation.
// The expression is taken from the rule, everything else - from the alert.
func (ar *AlertingRule) newAlertAPI(a notifier.Alert) *APIAlert {
	// ids are encoded as strings to avoid rounding
	alertID := strconv.FormatUint(a.ID, 10)
	groupID := strconv.FormatUint(a.GroupID, 10)
	value := strconv.FormatFloat(a.Value, 'e', -1, 64)
	return &APIAlert{
		ID:          alertID,
		GroupID:     groupID,
		Name:        a.Name,
		Expression:  ar.Expr,
		Labels:      a.Labels,
		Annotations: a.Annotations,
		State:       a.State.String(),
		ActiveAt:    a.Start,
		Value:       value,
	}
}
// Names of metrics and labels used for the time series
// generated from alerts.
const (
// alertMetricName is the metric name for synthetic alert timeseries.
alertMetricName = "ALERTS"
// alertForStateMetricName is the metric name for 'for' state of alert.
// Its value is the unix timestamp when the alert became active.
alertForStateMetricName = "ALERTS_FOR_STATE"
// alertNameLabel is the label name indicating the name of an alert.
alertNameLabel = "alertname"
// alertStateLabel is the label name indicating the state of an alert.
alertStateLabel = "alertstate"
// alertGroupNameLabel defines the label name attached for generated time series.
// Its value is the name of the group the alerting rule belongs to.
alertGroupNameLabel = "alertgroup"
)
// alertToTimeSeries converts the given alert with the given timestamp to timeseries.
// The result always contains the ALERTS series; the ALERTS_FOR_STATE series
// is added only for rules with For > 0.
func (ar *AlertingRule) alertToTimeSeries(a *notifier.Alert, timestamp time.Time) []prompbmarshal.TimeSeries {
	result := []prompbmarshal.TimeSeries{alertToTimeSeries(ar.Name, a, timestamp)}
	if ar.For > 0 {
		result = append(result, alertForToTimeSeries(ar.Name, a, timestamp))
	}
	return result
}
// alertToTimeSeries returns the ALERTS time series with value 1 for the given
// alert. The series carries a copy of the alert labels plus the metric name,
// the alert name and the alert state.
func alertToTimeSeries(name string, a *notifier.Alert, timestamp time.Time) prompbmarshal.TimeSeries {
	tsLabels := make(map[string]string, len(a.Labels)+3)
	for key, value := range a.Labels {
		tsLabels[key] = value
	}
	// reserved labels are set last, so they override alert labels with the same names
	tsLabels["__name__"] = alertMetricName
	tsLabels[alertNameLabel] = name
	tsLabels[alertStateLabel] = a.State.String()
	return newTimeSeries(1, tsLabels, timestamp)
}
// alertForToTimeSeries returns the ALERTS_FOR_STATE time series for the given
// alert. The value of the series is the unix time when the alert became active.
func alertForToTimeSeries(name string, a *notifier.Alert, timestamp time.Time) prompbmarshal.TimeSeries {
	tsLabels := make(map[string]string, len(a.Labels)+2)
	for key, value := range a.Labels {
		tsLabels[key] = value
	}
	// reserved labels are set last, so they override alert labels with the same names
	tsLabels["__name__"] = alertForStateMetricName
	tsLabels[alertNameLabel] = name
	activeAt := float64(a.Start.Unix())
	return newTimeSeries(activeAt, tsLabels, timestamp)
}
// Restore restores the state of active alerts basing on previously written timeseries.
// Restore restores only Start field. Field State will be always Pending and supposed
// to be updated on next Exec, as well as Value field.
//
// The given labels are added as filters to the query used for fetching the
// state, lookback limits how far in the past the state is searched.
// NOTE(review): the function itself doesn't check ar.For; presumably the
// caller is responsible for calling it only for rules with For > 0.
func (ar *AlertingRule) Restore(ctx context.Context, q datasource.Querier, lookback time.Duration, labels map[string]string) error {
	if q == nil {
		return fmt.Errorf("querier is nil")
	}

	qFn := func(query string) ([]datasource.Metric, error) { return q.Query(ctx, query) }

	// account for external labels in filter;
	// keys are sorted so the generated query is deterministic
	keys := make([]string, 0, len(labels))
	for k := range labels {
		keys = append(keys, k)
	}
	sort.Strings(keys)
	var labelsFilter string
	for _, k := range keys {
		labelsFilter += fmt.Sprintf(",%s=%q", k, labels[k])
	}

	// Get the last data point in range via MetricsQL `last_over_time`.
	// We don't use plain PromQL since Prometheus doesn't support
	// remote write protocol which is used for state persistence in vmalert.
	expr := fmt.Sprintf("last_over_time(%s{alertname=%q%s}[%ds])",
		alertForStateMetricName, ar.Name, labelsFilter, int(lookback.Seconds()))
	qMetrics, err := q.Query(ctx, expr)
	if err != nil {
		return err
	}

	for _, m := range qMetrics {
		metricLabels := m.Labels
		m.Labels = make([]datasource.Label, 0, len(metricLabels))
		// drop all extra labels, so hash key will
		// be identical to time series received in Exec
		for _, l := range metricLabels {
			if l.Name == alertNameLabel || l.Name == alertGroupNameLabel {
				continue
			}
			m.Labels = append(m.Labels, l)
		}

		// templates may execute queries, so the alert is built without holding the lock
		a, err := ar.newAlert(m, time.Unix(int64(m.Value), 0), qFn)
		if err != nil {
			return fmt.Errorf("failed to create alert: %w", err)
		}
		a.ID = hash(m)
		a.State = notifier.StatePending

		// ar.alerts is read by the gauge callbacks and the API under ar.mu
		ar.mu.Lock()
		ar.alerts[a.ID] = a
		ar.mu.Unlock()
		logger.Infof("alert %q (%d) restored to state at %v", a.Name, a.ID, a.Start)
	}
	return nil
}

View File

@@ -0,0 +1,478 @@
package main
import (
"context"
"errors"
"strings"
"testing"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
)
// TestAlertingRule_ToTimeSeries verifies that a single alert registered in a rule
// is converted by toTimeSeries into the expected set of time series:
// ALERTS for every rule and additionally ALERTS_FOR_STATE for rules with For > 0.
func TestAlertingRule_ToTimeSeries(t *testing.T) {
timestamp := time.Now()
testCases := []struct {
rule *AlertingRule
alert *notifier.Alert
expTS []prompbmarshal.TimeSeries
}{
{
newTestAlertingRule("instant", 0),
&notifier.Alert{State: notifier.StateFiring},
[]prompbmarshal.TimeSeries{
newTimeSeries(1, map[string]string{
"__name__": alertMetricName,
alertStateLabel: notifier.StateFiring.String(),
alertNameLabel: "instant",
}, timestamp),
},
},
{
newTestAlertingRule("instant extra labels", 0),
&notifier.Alert{State: notifier.StateFiring, Labels: map[string]string{
"job": "foo",
"instance": "bar",
}},
[]prompbmarshal.TimeSeries{
newTimeSeries(1, map[string]string{
"__name__": alertMetricName,
alertStateLabel: notifier.StateFiring.String(),
alertNameLabel: "instant extra labels",
"job": "foo",
"instance": "bar",
}, timestamp),
},
},
{
// reserved labels set on the alert must be overridden
// by the values generated for the time series
newTestAlertingRule("instant labels override", 0),
&notifier.Alert{State: notifier.StateFiring, Labels: map[string]string{
alertStateLabel: "foo",
"__name__": "bar",
}},
[]prompbmarshal.TimeSeries{
newTimeSeries(1, map[string]string{
"__name__": alertMetricName,
alertStateLabel: notifier.StateFiring.String(),
alertNameLabel: "instant labels override",
}, timestamp),
},
},
{
// rules with For > 0 additionally produce the ALERTS_FOR_STATE series
// with the alert start time as a value
newTestAlertingRule("for", time.Second),
&notifier.Alert{State: notifier.StateFiring, Start: timestamp.Add(time.Second)},
[]prompbmarshal.TimeSeries{
newTimeSeries(1, map[string]string{
"__name__": alertMetricName,
alertStateLabel: notifier.StateFiring.String(),
alertNameLabel: "for",
}, timestamp),
newTimeSeries(float64(timestamp.Add(time.Second).Unix()), map[string]string{
"__name__": alertForStateMetricName,
alertNameLabel: "for",
}, timestamp),
},
},
{
newTestAlertingRule("for pending", 10*time.Second),
&notifier.Alert{State: notifier.StatePending, Start: timestamp.Add(time.Second)},
[]prompbmarshal.TimeSeries{
newTimeSeries(1, map[string]string{
"__name__": alertMetricName,
alertStateLabel: notifier.StatePending.String(),
alertNameLabel: "for pending",
}, timestamp),
newTimeSeries(float64(timestamp.Add(time.Second).Unix()), map[string]string{
"__name__": alertForStateMetricName,
alertNameLabel: "for pending",
}, timestamp),
},
},
}
for _, tc := range testCases {
t.Run(tc.rule.Name, func(t *testing.T) {
// register the alert directly in the rule state, bypassing Exec
tc.rule.alerts[tc.alert.ID] = tc.alert
tss := tc.rule.toTimeSeries(timestamp)
if err := compareTimeSeries(t, tc.expTS, tss); err != nil {
t.Fatalf("timeseries missmatch: %s", err)
}
})
}
}
// TestAlertingRule_Exec verifies alert state transitions performed by Exec.
// Every test case defines a list of steps, where each step is the set of metrics
// returned by the querier for a single Exec call, and the alerts expected
// to be present in the rule after the last step.
//
// Exec compares the time passed since the alert start with the rule's For,
// so the test sleeps for defaultStep after every step: alerts of rules
// with For == defaultStep are expected to fire on the second consecutive step.
func TestAlertingRule_Exec(t *testing.T) {
const defaultStep = 5 * time.Millisecond
testCases := []struct {
rule *AlertingRule
steps [][]datasource.Metric
expAlerts map[uint64]*notifier.Alert
}{
{
newTestAlertingRule("empty", 0),
[][]datasource.Metric{},
map[uint64]*notifier.Alert{},
},
{
newTestAlertingRule("empty labels", 0),
[][]datasource.Metric{
{datasource.Metric{}},
},
map[uint64]*notifier.Alert{
hash(datasource.Metric{}): {State: notifier.StateFiring},
},
},
{
newTestAlertingRule("single-firing", 0),
[][]datasource.Metric{
{metricWithLabels(t, "name", "foo")},
},
map[uint64]*notifier.Alert{
hash(metricWithLabels(t, "name", "foo")): {State: notifier.StateFiring},
},
},
{
newTestAlertingRule("single-firing=>inactive", 0),
[][]datasource.Metric{
{metricWithLabels(t, "name", "foo")},
{},
},
map[uint64]*notifier.Alert{
hash(metricWithLabels(t, "name", "foo")): {State: notifier.StateInactive},
},
},
{
newTestAlertingRule("single-firing=>inactive=>firing", 0),
[][]datasource.Metric{
{metricWithLabels(t, "name", "foo")},
{},
{metricWithLabels(t, "name", "foo")},
},
map[uint64]*notifier.Alert{
hash(metricWithLabels(t, "name", "foo")): {State: notifier.StateFiring},
},
},
{
newTestAlertingRule("single-firing=>inactive=>firing=>inactive", 0),
[][]datasource.Metric{
{metricWithLabels(t, "name", "foo")},
{},
{metricWithLabels(t, "name", "foo")},
{},
},
map[uint64]*notifier.Alert{
hash(metricWithLabels(t, "name", "foo")): {State: notifier.StateInactive},
},
},
{
// the second empty step in a row removes the inactive alert
newTestAlertingRule("single-firing=>inactive=>firing=>inactive=>empty", 0),
[][]datasource.Metric{
{metricWithLabels(t, "name", "foo")},
{},
{metricWithLabels(t, "name", "foo")},
{},
{},
},
map[uint64]*notifier.Alert{},
},
{
newTestAlertingRule("single-firing=>inactive=>firing=>inactive=>empty=>firing", 0),
[][]datasource.Metric{
{metricWithLabels(t, "name", "foo")},
{},
{metricWithLabels(t, "name", "foo")},
{},
{},
{metricWithLabels(t, "name", "foo")},
},
map[uint64]*notifier.Alert{
hash(metricWithLabels(t, "name", "foo")): {State: notifier.StateFiring},
},
},
{
newTestAlertingRule("multiple-firing", 0),
[][]datasource.Metric{
{
metricWithLabels(t, "name", "foo"),
metricWithLabels(t, "name", "foo1"),
metricWithLabels(t, "name", "foo2"),
},
},
map[uint64]*notifier.Alert{
hash(metricWithLabels(t, "name", "foo")): {State: notifier.StateFiring},
hash(metricWithLabels(t, "name", "foo1")): {State: notifier.StateFiring},
hash(metricWithLabels(t, "name", "foo2")): {State: notifier.StateFiring},
},
},
{
newTestAlertingRule("multiple-steps-firing", 0),
[][]datasource.Metric{
{metricWithLabels(t, "name", "foo")},
{metricWithLabels(t, "name", "foo1")},
{metricWithLabels(t, "name", "foo2")},
},
// 1: fire first alert
// 2: fire second alert, set first inactive
// 3: fire third alert, set second inactive, delete first one
map[uint64]*notifier.Alert{
hash(metricWithLabels(t, "name", "foo1")): {State: notifier.StateInactive},
hash(metricWithLabels(t, "name", "foo2")): {State: notifier.StateFiring},
},
},
{
newTestAlertingRule("for-pending", time.Minute),
[][]datasource.Metric{
{metricWithLabels(t, "name", "foo")},
},
map[uint64]*notifier.Alert{
hash(metricWithLabels(t, "name", "foo")): {State: notifier.StatePending},
},
},
{
newTestAlertingRule("for-fired", defaultStep),
[][]datasource.Metric{
{metricWithLabels(t, "name", "foo")},
{metricWithLabels(t, "name", "foo")},
},
map[uint64]*notifier.Alert{
hash(metricWithLabels(t, "name", "foo")): {State: notifier.StateFiring},
},
},
{
newTestAlertingRule("for-pending=>empty", time.Second),
[][]datasource.Metric{
{metricWithLabels(t, "name", "foo")},
{metricWithLabels(t, "name", "foo")},
// empty step to reset and delete pending alerts
{},
},
map[uint64]*notifier.Alert{},
},
{
newTestAlertingRule("for-pending=>firing=>inactive", defaultStep),
[][]datasource.Metric{
{metricWithLabels(t, "name", "foo")},
{metricWithLabels(t, "name", "foo")},
// empty step to reset pending alerts
{},
},
map[uint64]*notifier.Alert{
hash(metricWithLabels(t, "name", "foo")): {State: notifier.StateInactive},
},
},
{
newTestAlertingRule("for-pending=>firing=>inactive=>pending", defaultStep),
[][]datasource.Metric{
{metricWithLabels(t, "name", "foo")},
{metricWithLabels(t, "name", "foo")},
// empty step to reset pending alerts
{},
{metricWithLabels(t, "name", "foo")},
},
map[uint64]*notifier.Alert{
hash(metricWithLabels(t, "name", "foo")): {State: notifier.StatePending},
},
},
{
newTestAlertingRule("for-pending=>firing=>inactive=>pending=>firing", defaultStep),
[][]datasource.Metric{
{metricWithLabels(t, "name", "foo")},
{metricWithLabels(t, "name", "foo")},
// empty step to reset pending alerts
{},
{metricWithLabels(t, "name", "foo")},
{metricWithLabels(t, "name", "foo")},
},
map[uint64]*notifier.Alert{
hash(metricWithLabels(t, "name", "foo")): {State: notifier.StateFiring},
},
},
}
fakeGroup := Group{Name: "TestRule_Exec"}
for _, tc := range testCases {
t.Run(tc.rule.Name, func(t *testing.T) {
fq := &fakeQuerier{}
tc.rule.GroupID = fakeGroup.ID()
for _, step := range tc.steps {
// every step fully replaces the metrics returned by the querier
fq.reset()
fq.add(step...)
if _, err := tc.rule.Exec(context.TODO(), fq, false); err != nil {
t.Fatalf("unexpected err: %s", err)
}
// artificial delay between applying steps
time.Sleep(defaultStep)
}
if len(tc.rule.alerts) != len(tc.expAlerts) {
t.Fatalf("expected %d alerts; got %d", len(tc.expAlerts), len(tc.rule.alerts))
}
// only the State of the alerts is compared
for key, exp := range tc.expAlerts {
got, ok := tc.rule.alerts[key]
if !ok {
t.Fatalf("expected to have key %d", key)
}
if got.State != exp.State {
t.Fatalf("expected state %d; got %d", exp.State, got.State)
}
}
})
}
}
// TestAlertingRule_Restore verifies that Restore re-creates pending alerts
// from the ALERTS_FOR_STATE series returned by the querier: the alert key
// must match the hash of the series labels without the service labels
// and the Start field must be equal to the series value.
func TestAlertingRule_Restore(t *testing.T) {
	// the current time is captured once, so the metrics and the expected alerts
	// are built from the same moment even if the test crosses an hour boundary
	now := time.Now()
	startAt := func(d time.Duration) time.Time { return now.Truncate(d) }

	testCases := []struct {
		rule      *AlertingRule
		metrics   []datasource.Metric
		expAlerts map[uint64]*notifier.Alert
	}{
		{
			newTestRuleWithLabels("no extra labels"),
			[]datasource.Metric{
				metricWithValueAndLabels(t, float64(startAt(time.Hour).Unix()),
					"__name__", alertForStateMetricName,
					alertNameLabel, "",
				),
			},
			map[uint64]*notifier.Alert{
				hash(datasource.Metric{}): {State: notifier.StatePending,
					Start: startAt(time.Hour)},
			},
		},
		{
			newTestRuleWithLabels("metric labels"),
			[]datasource.Metric{
				metricWithValueAndLabels(t, float64(startAt(time.Hour).Unix()),
					"__name__", alertForStateMetricName,
					alertNameLabel, "",
					alertGroupNameLabel, "groupID",
					"foo", "bar",
					"namespace", "baz",
				),
			},
			map[uint64]*notifier.Alert{
				hash(metricWithLabels(t,
					"foo", "bar",
					"namespace", "baz",
				)): {State: notifier.StatePending,
					Start: startAt(time.Hour)},
			},
		},
		{
			newTestRuleWithLabels("rule labels", "source", "vm"),
			[]datasource.Metric{
				metricWithValueAndLabels(t, float64(startAt(time.Hour).Unix()),
					"__name__", alertForStateMetricName,
					alertNameLabel, "",
					"foo", "bar",
					"namespace", "baz",
					// extra labels set by rule
					"source", "vm",
				),
			},
			map[uint64]*notifier.Alert{
				hash(metricWithLabels(t,
					"foo", "bar",
					"namespace", "baz",
					"source", "vm",
				)): {State: notifier.StatePending,
					Start: startAt(time.Hour)},
			},
		},
		{
			newTestRuleWithLabels("multiple alerts"),
			[]datasource.Metric{
				metricWithValueAndLabels(t, float64(startAt(time.Hour).Unix()),
					"__name__", alertForStateMetricName,
					"host", "localhost-1",
				),
				metricWithValueAndLabels(t, float64(startAt(2*time.Hour).Unix()),
					"__name__", alertForStateMetricName,
					"host", "localhost-2",
				),
				metricWithValueAndLabels(t, float64(startAt(3*time.Hour).Unix()),
					"__name__", alertForStateMetricName,
					"host", "localhost-3",
				),
			},
			map[uint64]*notifier.Alert{
				hash(metricWithLabels(t, "host", "localhost-1")): {State: notifier.StatePending,
					Start: startAt(time.Hour)},
				hash(metricWithLabels(t, "host", "localhost-2")): {State: notifier.StatePending,
					Start: startAt(2 * time.Hour)},
				hash(metricWithLabels(t, "host", "localhost-3")): {State: notifier.StatePending,
					Start: startAt(3 * time.Hour)},
			},
		},
	}
	fakeGroup := Group{Name: "TestRule_Exec"}
	for _, tc := range testCases {
		t.Run(tc.rule.Name, func(t *testing.T) {
			fq := &fakeQuerier{}
			tc.rule.GroupID = fakeGroup.ID()
			fq.add(tc.metrics...)
			if err := tc.rule.Restore(context.TODO(), fq, time.Hour, nil); err != nil {
				t.Fatalf("unexpected err: %s", err)
			}
			if len(tc.rule.alerts) != len(tc.expAlerts) {
				t.Fatalf("expected %d alerts; got %d", len(tc.expAlerts), len(tc.rule.alerts))
			}
			for key, exp := range tc.expAlerts {
				got, ok := tc.rule.alerts[key]
				if !ok {
					t.Fatalf("expected to have key %d", key)
				}
				if got.State != exp.State {
					t.Fatalf("expected state %d; got %d", exp.State, got.State)
				}
				// time.Time must be compared with Equal: `!=` also compares
				// the location pointer and the monotonic clock reading
				if !got.Start.Equal(exp.Start) {
					t.Fatalf("expected Start %v; got %v", exp.Start, got.Start)
				}
			}
		})
	}
}
// TestAlertingRule_Exec_Negative verifies error handling in Exec:
// duplicated series produced by rule extra labels and querier errors.
func TestAlertingRule_Exec_Negative(t *testing.T) {
	fq := &fakeQuerier{}
	ar := newTestAlertingRule("test", 0)
	ar.Labels = map[string]string{"job": "test"}

	// successful attempt
	fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar"))
	_, err := ar.Exec(context.TODO(), fq, false)
	if err != nil {
		t.Fatal(err)
	}

	// label `job` will collide with rule extra label and will make both time series equal
	fq.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "baz"))
	_, err = ar.Exec(context.TODO(), fq, false)
	if !errors.Is(err, errDuplicate) {
		// %v is used since err may be nil here
		t.Fatalf("expected to have %v error; got %v", errDuplicate, err)
	}

	fq.reset()

	expErr := "connection reset by peer"
	fq.setErr(errors.New(expErr))
	_, err = ar.Exec(context.TODO(), fq, false)
	if err == nil {
		t.Fatalf("expected to get err; got nil")
	}
	if !strings.Contains(err.Error(), expErr) {
		t.Fatalf("expected to get err %q; got %q instead", expErr, err)
	}
}
// newTestRuleWithLabels returns a test rule with the given name and For == 0.
// The labels must be passed as a flat list of name-value pairs.
func newTestRuleWithLabels(name string, labels ...string) *AlertingRule {
	rule := newTestAlertingRule(name, 0)
	rule.Labels = map[string]string{}
	for i := 0; i < len(labels); i += 2 {
		key, value := labels[i], labels[i+1]
		rule.Labels[key] = value
	}
	return rule
}
// newTestAlertingRule returns a rule with the given name and For duration
// and an empty set of alerts. No metrics are registered for the rule.
func newTestAlertingRule(name string, waitFor time.Duration) *AlertingRule {
	rule := AlertingRule{
		Name:   name,
		For:    waitFor,
		alerts: make(map[uint64]*notifier.Alert),
	}
	return &rule
}

View File

@@ -0,0 +1,276 @@
package config
import (
"crypto/md5"
"fmt"
"hash/fnv"
"io/ioutil"
"path/filepath"
"sort"
"strings"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/envtemplate"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/metricsql"
"gopkg.in/yaml.v2"
)
// Group contains list of Rules grouped into
// entity with one name and evaluation interval
type Group struct {
// File is the path to the file the group was read from.
// It isn't a part of yaml definition and is set by Parse.
File string
// Name is the mandatory name of the group, see Validate.
Name string `yaml:"name"`
Interval time.Duration `yaml:"interval,omitempty"`
// Rules is the list of rules of the group. It can't be empty, see Validate.
Rules []Rule `yaml:"rules"`
Concurrency int `yaml:"concurrency"`
// Checksum stores the hash of yaml definition for this group.
// May be used to detect any changes like rules re-ordering etc.
// It is calculated in UnmarshalYAML.
Checksum string
// Catches all undefined fields and must be empty after parsing.
XXX map[string]interface{} `yaml:",inline"`
}
// UnmarshalYAML implements the yaml.Unmarshaler interface.
// After decoding the group it stores the hex-encoded md5 sum
// of the re-marshaled group in the Checksum field.
func (g *Group) UnmarshalYAML(unmarshal func(interface{}) error) error {
	// the alias type has no UnmarshalYAML method,
	// which prevents infinite recursion
	type group Group
	if err := unmarshal((*group)(g)); err != nil {
		return err
	}
	encoded, err := yaml.Marshal(g)
	if err != nil {
		return fmt.Errorf("failed to marshal group configuration for checksum: %w", err)
	}
	sum := md5.Sum(encoded)
	g.Checksum = fmt.Sprintf("%x", sum[:])
	return nil
}
// Validate checks the Group and its Rules for configuration errors:
// empty group name, empty rules list, rules with duplicate IDs, invalid rules
// and unknown yaml fields. Rule expressions are parsed if validateExpressions
// is set; templates in annotations and labels are checked
// if validateAnnotations is set.
func (g *Group) Validate(validateAnnotations, validateExpressions bool) error {
	switch {
	case g.Name == "":
		return fmt.Errorf("group name must be set")
	case len(g.Rules) == 0:
		return fmt.Errorf("group %q can't contain no rules", g.Name)
	}

	seen := make(map[uint64]struct{}, len(g.Rules))
	for _, r := range g.Rules {
		// the alert name has priority over the record name in error messages
		name := r.Alert
		if name == "" {
			name = r.Record
		}
		if _, dup := seen[r.ID]; dup {
			return fmt.Errorf("rule %q duplicate", name)
		}
		seen[r.ID] = struct{}{}

		if err := r.Validate(); err != nil {
			return fmt.Errorf("invalid rule %q.%q: %w", g.Name, name, err)
		}
		if validateExpressions {
			_, err := metricsql.Parse(r.Expr)
			if err != nil {
				return fmt.Errorf("invalid expression for rule %q.%q: %w", g.Name, name, err)
			}
		}
		if !validateAnnotations {
			continue
		}
		if err := notifier.ValidateTemplates(r.Annotations); err != nil {
			return fmt.Errorf("invalid annotations for rule %q.%q: %w", g.Name, name, err)
		}
		if err := notifier.ValidateTemplates(r.Labels); err != nil {
			return fmt.Errorf("invalid labels for rule %q.%q: %w", g.Name, name, err)
		}
	}
	return checkOverflow(g.XXX, fmt.Sprintf("group %q", g.Name))
}
// Rule describes entity that represent either
// recording rule or alerting rule.
// Exactly one of Record and Alert must be set, see Validate.
type Rule struct {
// ID is the hash of the rule calculated by HashRule in UnmarshalYAML.
ID uint64
// Record is the name of the recording rule.
Record string `yaml:"record,omitempty"`
// Alert is the name of the alerting rule.
Alert string `yaml:"alert,omitempty"`
// Expr is the mandatory expression of the rule.
Expr string `yaml:"expr"`
For PromDuration `yaml:"for"`
Labels map[string]string `yaml:"labels,omitempty"`
Annotations map[string]string `yaml:"annotations,omitempty"`
// Catches all undefined fields and must be empty after parsing.
XXX map[string]interface{} `yaml:",inline"`
}
// PromDuration is Prometheus duration.
// It is parsed from yaml via metricsql.DurationValue
// and is stored with millisecond precision.
type PromDuration struct {
milliseconds int64
}
// NewPromDuration converts d into PromDuration.
// The result has millisecond precision.
func NewPromDuration(d time.Duration) PromDuration {
	var pd PromDuration
	pd.milliseconds = d.Milliseconds()
	return pd
}
// MarshalYAML implements yaml.Marshaler interface.
// The duration is encoded in the time.Duration string format.
func (pd PromDuration) MarshalYAML() (interface{}, error) {
	encoded := pd.Duration().String()
	return encoded, nil
}
// UnmarshalYAML implements yaml.Unmarshaler interface.
// The value is decoded as a string and parsed with metricsql.DurationValue.
// pd is left untouched on error.
func (pd *PromDuration) UnmarshalYAML(unmarshal func(interface{}) error) error {
	var raw string
	err := unmarshal(&raw)
	if err != nil {
		return err
	}
	millis, err := metricsql.DurationValue(raw, 0)
	if err != nil {
		return err
	}
	pd.milliseconds = millis
	return nil
}
// Duration returns duration for pd.
// It returns 0 for nil pd instead of panicking.
func (pd *PromDuration) Duration() time.Duration {
	if pd == nil {
		return 0
	}
	return time.Duration(pd.milliseconds) * time.Millisecond
}
// UnmarshalYAML implements the yaml.Unmarshaler interface.
// The rule ID is calculated with HashRule once the rule is decoded.
func (r *Rule) UnmarshalYAML(unmarshal func(interface{}) error) error {
	// the alias type has no UnmarshalYAML method,
	// which prevents infinite recursion
	type rule Rule
	err := unmarshal((*rule)(r))
	if err == nil {
		r.ID = HashRule(*r)
	}
	return err
}
// Name returns the name of the rule: the value of Record
// if it is set, otherwise the value of Alert.
func (r *Rule) Name() string {
	if r.Record == "" {
		return r.Alert
	}
	return r.Record
}
// HashRule returns FNV-1a hash of the significant Rule fields:
// the expression, the rule type with the rule name, and the labels
// sorted by key. The hash is supposed to define Rule uniqueness.
func HashRule(r Rule) uint64 {
	kind, name := "alerting", r.Alert
	if r.Record != "" {
		kind, name = "recording", r.Record
	}

	hasher := fnv.New64a()
	hasher.Write([]byte(r.Expr))
	hasher.Write([]byte(kind))
	hasher.Write([]byte(name))
	for _, label := range sortMap(r.Labels) {
		hasher.Write([]byte(label.key))
		hasher.Write([]byte(label.value))
		hasher.Write([]byte("\xff"))
	}
	return hasher.Sum64()
}
// Validate checks the Rule for configuration errors: exactly one of
// `record` and `alert` must be set, the expression must be non-empty
// and the rule must not contain unknown fields.
func (r *Rule) Validate() error {
	isRecording, isAlerting := r.Record != "", r.Alert != ""
	switch {
	case isRecording == isAlerting:
		// either both names are set or none of them
		return fmt.Errorf("either `record` or `alert` must be set")
	case r.Expr == "":
		return fmt.Errorf("expression can't be empty")
	}
	return checkOverflow(r.XXX, "rule")
}
// Parse reads and validates groups from all the files matching
// the given glob patterns. Group names must be unique within a file.
//
// Errors for separate files and groups are collected and returned together;
// no groups are returned in this case. A warning is logged
// if no groups are found.
func Parse(pathPatterns []string, validateAnnotations, validateExpressions bool) ([]Group, error) {
	var files []string
	for _, p := range pathPatterns {
		found, err := filepath.Glob(p)
		if err != nil {
			return nil, fmt.Errorf("error reading file pattern %s: %w", p, err)
		}
		files = append(files, found...)
	}

	var (
		parsed []Group
		errs   = new(utils.ErrGroup)
	)
	for _, file := range files {
		fileGroups, err := parseFile(file)
		if err != nil {
			errs.Add(fmt.Errorf("failed to parse file %q: %w", file, err))
			continue
		}
		// group names seen in the current file
		seen := map[string]struct{}{}
		for _, g := range fileGroups {
			err := g.Validate(validateAnnotations, validateExpressions)
			if err != nil {
				errs.Add(fmt.Errorf("invalid group %q in file %q: %w", g.Name, file, err))
				continue
			}
			if _, dup := seen[g.Name]; dup {
				errs.Add(fmt.Errorf("group name %q duplicate in file %q", g.Name, file))
				continue
			}
			seen[g.Name] = struct{}{}
			g.File = file
			parsed = append(parsed, g)
		}
	}
	if err := errs.Err(); err != nil {
		return nil, err
	}
	if len(parsed) == 0 {
		logger.Warnf("no groups found in %s", strings.Join(pathPatterns, ";"))
	}
	return parsed, nil
}
// parseFile reads groups from the yaml file at the given path.
// The file contents are passed through envtemplate.Replace before decoding.
// An error is returned if the file can't be read or decoded or if it contains
// unknown top-level fields; no groups are returned in this case.
func parseFile(path string) ([]Group, error) {
	data, err := ioutil.ReadFile(path)
	if err != nil {
		return nil, fmt.Errorf("error reading alert rule file: %w", err)
	}
	data = envtemplate.Replace(data)
	g := struct {
		Groups []Group `yaml:"groups"`
		// Catches all undefined fields and must be empty after parsing.
		XXX map[string]interface{} `yaml:",inline"`
	}{}
	if err := yaml.Unmarshal(data, &g); err != nil {
		return nil, err
	}
	if err := checkOverflow(g.XXX, "config"); err != nil {
		return nil, err
	}
	return g.Groups, nil
}
// checkOverflow returns an error listing the keys of m if m isn't empty.
// It is used for reporting unknown fields caught during yaml parsing;
// ctx describes the entity the fields belong to.
// The keys are sorted, so the error message is deterministic.
func checkOverflow(m map[string]interface{}, ctx string) error {
	if len(m) == 0 {
		return nil
	}
	keys := make([]string, 0, len(m))
	for k := range m {
		keys = append(keys, k)
	}
	sort.Strings(keys)
	return fmt.Errorf("unknown fields in %s: %s", ctx, strings.Join(keys, ", "))
}
// item is a single key-value pair of a map.
type item struct {
	key, value string
}

// sortMap returns the entries of m as a list sorted by key.
// The result is nil for an empty map.
func sortMap(m map[string]string) []item {
	var pairs []item
	for key, value := range m {
		pairs = append(pairs, item{key, value})
	}
	byKey := func(i, j int) bool { return pairs[i].key < pairs[j].key }
	sort.Slice(pairs, byKey)
	return pairs
}

View File

@@ -0,0 +1,380 @@
package config
import (
"net/url"
"os"
"strings"
"testing"
"time"
"gopkg.in/yaml.v2"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
)
// TestMain initializes template functions with a fixed external URL
// before running the tests of the package.
func TestMain(m *testing.M) {
	externalURL, _ := url.Parse("https://victoriametrics.com/path")
	notifier.InitTemplateFunc(externalURL)
	code := m.Run()
	os.Exit(code)
}
// TestParseGood verifies that all the valid rule files from testdata
// are parsed with annotations and expressions validation enabled.
func TestParseGood(t *testing.T) {
	patterns := []string{"testdata/*good.rules", "testdata/dir/*good.*"}
	_, err := Parse(patterns, true, true)
	if err != nil {
		t.Errorf("error parsing files %s", err)
	}
}
// TestParseBad verifies that parsing of every invalid rule file
// from testdata fails with the expected error.
func TestParseBad(t *testing.T) {
	testCases := []struct {
		path   []string
		expErr string
	}{
		{
			[]string{"testdata/rules0-bad.rules"},
			"unexpected token",
		},
		{
			[]string{"testdata/dir/rules0-bad.rules"},
			"error parsing annotation",
		},
		{
			[]string{"testdata/dir/rules1-bad.rules"},
			"duplicate in file",
		},
		{
			[]string{"testdata/dir/rules2-bad.rules"},
			"function \"unknown\" not defined",
		},
		{
			[]string{"testdata/dir/rules3-bad.rules"},
			"either `record` or `alert` must be set",
		},
		{
			[]string{"testdata/dir/rules4-bad.rules"},
			"either `record` or `alert` must be set",
		},
	}
	for _, tc := range testCases {
		_, err := Parse(tc.path, true, true)
		if err == nil {
			// proceed to the next case, so one failure doesn't hide the others
			t.Errorf("expected to get error for %v", tc.path)
			continue
		}
		if !strings.Contains(err.Error(), tc.expErr) {
			t.Errorf("%v: expected err to contain %q; got %q instead", tc.path, tc.expErr, err)
		}
	}
}
// TestRule_Validate checks Rule.Validate against a rule without a name,
// a rule without an expression and a fully specified alerting rule.
func TestRule_Validate(t *testing.T) {
	unnamed := &Rule{}
	if unnamed.Validate() == nil {
		t.Errorf("expected empty name error")
	}

	withoutExpr := &Rule{Alert: "alert"}
	if withoutExpr.Validate() == nil {
		t.Errorf("expected empty expr error")
	}

	valid := &Rule{Alert: "alert", Expr: "test>0"}
	if err := valid.Validate(); err != nil {
		t.Errorf("expected valid rule; got %s", err)
	}
}
// TestGroup_Validate runs Group.Validate over a table of groups.
//
// A case with an empty expErr must validate successfully; any other case must
// fail with an error containing expErr. Unexpected errors are reported
// explicitly: matching them against an empty expErr with strings.Contains
// would always succeed and hide the failure.
func TestGroup_Validate(t *testing.T) {
	testCases := []struct {
		group               *Group
		rules               []Rule
		validateAnnotations bool
		validateExpressions bool
		expErr              string
	}{
		{
			group:  &Group{},
			expErr: "group name must be set",
		},
		{
			group:  &Group{Name: "test"},
			expErr: "contain no rules",
		},
		{
			group: &Group{Name: "test",
				Rules: []Rule{
					{
						Record: "record",
						Expr:   "up | 0",
					},
				},
			},
			expErr: "",
		},
		{
			group: &Group{Name: "test",
				Rules: []Rule{
					{
						Record: "record",
						Expr:   "up | 0",
					},
				},
			},
			expErr:              "invalid expression",
			validateExpressions: true,
		},
		{
			group: &Group{Name: "test",
				Rules: []Rule{
					{
						Alert: "alert",
						Expr:  "up == 1",
						Labels: map[string]string{
							"summary": "{{ value|query }}",
						},
					},
				},
			},
			expErr: "",
		},
		{
			group: &Group{Name: "test",
				Rules: []Rule{
					{
						Alert: "alert",
						Expr:  "up == 1",
						Labels: map[string]string{
							"summary": `
{{ with printf "node_memory_MemTotal{job='node',instance='%s'}" "localhost" | query }}
{{ . | first | value | humanize1024 }}B
{{ end }}`,
						},
					},
				},
			},
			validateAnnotations: true,
		},
		{
			group: &Group{Name: "test",
				Rules: []Rule{
					{
						Alert: "alert",
						Expr:  "up == 1",
					},
					{
						Alert: "alert",
						Expr:  "up == 1",
					},
				},
			},
			expErr: "duplicate",
		},
		{
			group: &Group{Name: "test",
				Rules: []Rule{
					{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
						"summary": "{{ value|query }}",
					}},
					{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
						"summary": "{{ value|query }}",
					}},
				},
			},
			expErr: "duplicate",
		},
		{
			group: &Group{Name: "test",
				Rules: []Rule{
					{Record: "record", Expr: "up == 1", Labels: map[string]string{
						"summary": "{{ value|query }}",
					}},
					{Record: "record", Expr: "up == 1", Labels: map[string]string{
						"summary": "{{ value|query }}",
					}},
				},
			},
			expErr: "duplicate",
		},
		{
			group: &Group{Name: "test",
				Rules: []Rule{
					{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
						"summary": "{{ value|query }}",
					}},
					{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
						"description": "{{ value|query }}",
					}},
				},
			},
			expErr: "",
		},
		{
			group: &Group{Name: "test",
				Rules: []Rule{
					{Record: "alert", Expr: "up == 1", Labels: map[string]string{
						"summary": "{{ value|query }}",
					}},
					{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
						"summary": "{{ value|query }}",
					}},
				},
			},
			expErr: "",
		},
	}
	for i, tc := range testCases {
		err := tc.group.Validate(tc.validateAnnotations, tc.validateExpressions)
		if tc.expErr == "" {
			// The group must be valid: any error is a failure.
			if err != nil {
				t.Errorf("case %d: unexpected error: %s", i, err)
			}
			continue
		}
		if err == nil {
			t.Errorf("case %d: expected to get err %q; got nil instead", i, tc.expErr)
			continue
		}
		if !strings.Contains(err.Error(), tc.expErr) {
			t.Errorf("case %d: expected err to contain %q; got %q instead", i, tc.expErr, err)
		}
	}
}
// TestHashRule checks which rule attributes take part in HashRule.
//
// Each case holds two rules and whether their hashes are expected to match.
// All cases are evaluated, and a failure reports both hashes together with
// the expectation so the mismatching case can be diagnosed from the output.
func TestHashRule(t *testing.T) {
	testCases := []struct {
		a, b  Rule
		equal bool
	}{
		{
			Rule{Record: "record", Expr: "up == 1"},
			Rule{Record: "record", Expr: "up == 1"},
			true,
		},
		{
			Rule{Alert: "alert", Expr: "up == 1"},
			Rule{Alert: "alert", Expr: "up == 1"},
			true,
		},
		{
			Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
				"foo": "bar",
				"baz": "foo",
			}},
			Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
				"foo": "bar",
				"baz": "foo",
			}},
			true,
		},
		{
			Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
				"foo": "bar",
				"baz": "foo",
			}},
			Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
				"baz": "foo",
				"foo": "bar",
			}},
			true,
		},
		{
			Rule{Alert: "record", Expr: "up == 1"},
			Rule{Alert: "record", Expr: "up == 1"},
			true,
		},
		{
			// `for` is expected not to affect the hash
			Rule{Alert: "alert", Expr: "up == 1", For: NewPromDuration(time.Minute)},
			Rule{Alert: "alert", Expr: "up == 1"},
			true,
		},
		{
			Rule{Alert: "record", Expr: "up == 1"},
			Rule{Record: "record", Expr: "up == 1"},
			false,
		},
		{
			Rule{Record: "record", Expr: "up == 1"},
			Rule{Record: "record", Expr: "up == 2"},
			false,
		},
		{
			Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
				"foo": "bar",
				"baz": "foo",
			}},
			Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
				"baz": "foo",
				"foo": "baz",
			}},
			false,
		},
		{
			Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
				"foo": "bar",
				"baz": "foo",
			}},
			Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
				"baz": "foo",
			}},
			false,
		},
		{
			Rule{Alert: "alert", Expr: "up == 1", Labels: map[string]string{
				"foo": "bar",
				"baz": "foo",
			}},
			Rule{Alert: "alert", Expr: "up == 1"},
			false,
		},
	}
	for i, tc := range testCases {
		aID, bID := HashRule(tc.a), HashRule(tc.b)
		if tc.equal != (aID == bID) {
			t.Errorf("mismatch for rule %d: expected equal hashes=%t; got %v and %v", i, tc.equal, aID, bID)
		}
	}
}
// TestGroupChecksum checks that Group.Checksum, populated while unmarshaling
// a group from YAML, is non-empty and differs between two group definitions.
func TestGroupChecksum(t *testing.T) {
// f unmarshals data and newData into separate groups and fails the test
// if the first checksum is empty or if both checksums are equal.
f := func(t *testing.T, data, newData string) {
t.Helper()
var g Group
if err := yaml.Unmarshal([]byte(data), &g); err != nil {
t.Fatalf("failed to unmarshal: %s", err)
}
if g.Checksum == "" {
t.Fatalf("expected to get non-empty checksum")
}
var ng Group
if err := yaml.Unmarshal([]byte(newData), &ng); err != nil {
t.Fatalf("failed to unmarshal: %s", err)
}
if g.Checksum == ng.Checksum {
t.Fatalf("expected to get different checksums")
}
}
// The same two rules listed in a different order must change the checksum.
t.Run("Ok", func(t *testing.T) {
f(t, `
name: TestGroup
rules:
- alert: ExampleAlertAlwaysFiring
expr: sum by(job) (up == 1)
- record: handler:requests:rate5m
expr: sum(rate(prometheus_http_requests_total[5m])) by (handler)
`, `
name: TestGroup
rules:
- record: handler:requests:rate5m
expr: sum(rate(prometheus_http_requests_total[5m])) by (handler)
- alert: ExampleAlertAlwaysFiring
expr: sum by(job) (up == 1)
`)
})
// Dropping `for` from an otherwise identical rule must change the checksum.
t.Run("Ok, `for` must change cs", func(t *testing.T) {
f(t, `
name: TestGroup
rules:
- alert: ExampleAlertWithFor
expr: sum by(job) (up == 1)
for: 5m
`, `
name: TestGroup
rules:
- alert: ExampleAlertWithFor
expr: sum by(job) (up == 1)
`)
})
}

View File

@@ -0,0 +1,19 @@
groups:
- name: group
rules:
- alert: InvalidAnnotations
for: 5m
expr: vm_rows > 0
labels:
label: bar
annotations:
summary: "{{ $value }"
description: "{{$labels}}"
- alert: UnkownAnnotationsFunction
for: 5m
expr: vm_rows > 0
labels:
label: bar
annotations:
summary: "{{ value|query }}"
description: "{{$labels}}"

View File

@@ -0,0 +1,14 @@
groups:
- name: duplicatedGroupDiffFiles
rules:
- alert: VMRows
for: 5m
expr: vm_rows > 0
labels:
label: bar
expr: "{{ $expr|queryEscape }}"
annotations:
summary: "{{ $value|humanize }}"
description: "{{$labels}}"

View File

@@ -0,0 +1,22 @@
groups:
- name: sameGroup
rules:
- alert: alert
for: 5m
expr: vm_rows > 0
labels:
label: bar
annotations:
summary: "{{ $value }}"
description: "{{$labels}}"
- name: sameGroup
rules:
- alert: alert
for: 5m
expr: vm_rows > 0
labels:
label: bar
annotations:
summary: "{{ $value }}"
description: "{{$labels}}"

View File

@@ -0,0 +1,11 @@
groups:
- name: duplicatedGroupDiffFiles
rules:
- alert: VMRows
for: 5m
expr: vm_rows > 0
labels:
label: bar
annotations:
summary: "{{ $value }}"
description: "{{$labels}}"

View File

@@ -0,0 +1,11 @@
groups:
- name: group
rules:
- alert: UnkownLabelFunction
for: 5m
expr: vm_rows > 0
labels:
label: bar
summary: "{{ unknown|query }}"
annotations:
description: "{{$labels}}"

View File

@@ -0,0 +1,5 @@
groups:
- name: group
rules:
- for: 5m
expr: vm_rows > 0

View File

@@ -0,0 +1,7 @@
groups:
- name: group
rules:
- alert: rows
record: record
for: 5m
expr: vm_rows > 0

View File

@@ -0,0 +1,7 @@
groups:
- name: group
rules:
- alert: rows
expr: vm_rows > 0
- record: rows
expr: sum(vm_rows)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,28 @@
groups:
- name: group
rules:
- alert: InvalidExpr
for: 5m
expr: vm_rows{ > 0
labels:
label: bar
annotations:
summary: "{{ $value }}"
description: "{{$labels}}"
- alert: EmptyExpr
for: 5m
expr: ""
labels:
label: bar
annotations:
summary: "{{ $value }}"
description: "{{$labels}}"
- alert: ""
for: 5m
expr: vm_rows > 0
labels:
label: foo
annotations:
summary: "{{ $value }}"
description: "{{$labels}}"

View File

@@ -0,0 +1,23 @@
groups:
- name: groupGorSingleAlert
rules:
- alert: VMRows
for: 10s
expr: vm_rows > 0
labels:
label: bar
host: "{{ $labels.instance }}"
annotations:
summary: "{{ $value|humanize }}"
description: "{{$labels}}"
- name: TestGroup
rules:
- alert: Conns
expr: sum(vm_tcplistener_conns) by(instance) > 1
annotations:
summary: "Too high connection number for {{$labels.instance}}"
description: "It is {{ $value }} connections for {{$labels.instance}}"
- alert: ExampleAlertAlwaysFiring
expr: sum by(job)
(up == 1)

View File

@@ -0,0 +1,11 @@
groups:
- name: groupTest
rules:
- alert: VMRows
for: 1ms
expr: vm_rows > 0
labels:
label: bar
host: "{{ $labels.instance }}"
annotations:
summary: "{{ $value }}"

View File

@@ -0,0 +1,42 @@
groups:
- name: TestGroup
interval: 2s
concurrency: 2
rules:
- alert: Conns
expr: sum(vm_tcplistener_conns) by(instance) > 1
for: 3m
annotations:
summary: Too high connection number for {{$labels.instance}}
{{ with printf "sum(vm_tcplistener_conns{instance=%q})" .Labels.instance | query }}
{{ . | first | value }}
{{ end }}
description: "It is {{ $value }} connections for {{$labels.instance}}"
- alert: ExampleAlertAlwaysFiring
expr: sum by(job)
(up == 1)
annotations:
summary: Instances up {{ range query "up" }}
{{ . | label "instance" }}
{{ end }}
- record: handler:requests:rate5m
expr: sum(rate(prometheus_http_requests_total[5m])) by (handler)
labels:
recording: true
- record: code:requests:rate5m
expr: sum(rate(promhttp_metric_handler_requests_total[5m])) by (code)
labels:
env: dev
recording: true
- record: code:requests:rate5m
expr: sum(rate(promhttp_metric_handler_requests_total[5m])) by (code)
labels:
env: staging
recording: true
- record: successful_requests:ratio_rate5m
labels:
recording: true
expr: |2
sum(code:requests:rate5m{code="200"})
/
sum(code:requests:rate5m)

View File

@@ -0,0 +1,52 @@
package datasource
import "context"
// Querier interface wraps Query method which
// executes given query and returns list of Metrics
// as result
type Querier interface {
// Query executes the given query string and returns the resulting
// metrics, or an error if the query could not be completed.
Query(ctx context.Context, query string) ([]Metric, error)
}
// Metric is the basic entity returned by a datasource.
// It represents a single data point together with its full list of labels.
type Metric struct {
	Labels    []Label
	Timestamp int64
	Value     float64
}

// SetLabel overwrites the value of the first label named key.
// If no such label exists, a new one is appended.
func (m *Metric) SetLabel(key, value string) {
	for idx := range m.Labels {
		if m.Labels[idx].Name != key {
			continue
		}
		m.Labels[idx].Value = value
		return
	}
	m.AddLabel(key, value)
}

// AddLabel appends the given label to the label set
// without checking whether a label with the same name already exists.
func (m *Metric) AddLabel(key, value string) {
	m.Labels = append(m.Labels, Label{Name: key, Value: value})
}

// Label returns the value of the first label named key.
// An empty string is returned if the label is missing.
func (m *Metric) Label(key string) string {
	for idx := range m.Labels {
		if lbl := &m.Labels[idx]; lbl.Name == key {
			return lbl.Value
		}
	}
	return ""
}

// Label represents a single name/value pair attached to a Metric.
type Label struct {
	Name  string
	Value string
}

View File

@@ -0,0 +1,43 @@
package datasource
import (
"flag"
"fmt"
"net/http"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
)
// Command-line flags configuring the connection to the datasource.
var (
	addr = flag.String("datasource.url", "", "Victoria Metrics or VMSelect url. Required parameter."+
		" E.g. http://127.0.0.1:8428")
	basicAuthUsername     = flag.String("datasource.basicAuth.username", "", "Optional basic auth username for -datasource.url")
	basicAuthPassword     = flag.String("datasource.basicAuth.password", "", "Optional basic auth password for -datasource.url")
	tlsInsecureSkipVerify = flag.Bool("datasource.tlsInsecureSkipVerify", false, "Whether to skip tls verification when connecting to -datasource.url")
	tlsCertFile           = flag.String("datasource.tlsCertFile", "", "Optional path to client-side TLS certificate file to use when connecting to -datasource.url")
	tlsKeyFile            = flag.String("datasource.tlsKeyFile", "", "Optional path to client-side TLS certificate key to use when connecting to -datasource.url")
	tlsCAFile             = flag.String("datasource.tlsCAFile", "", "Optional path to TLS CA file to use for verifying connections to -datasource.url. "+
		"By default system CA is used")
	tlsServerName = flag.String("datasource.tlsServerName", "", "Optional TLS server name to use for connections to -datasource.url. "+
		"By default the server name from -datasource.url is used")
	lookBack = flag.Duration("datasource.lookback", 0, "Lookback defines how far to look into past when evaluating queries. "+
		"For example, if datasource.lookback=5m then param \"time\" with value now()-5m will be added to every query.")
	// The first part of the description ends with a space so the two
	// sentences do not run together in the -help output.
	maxIdleConnections = flag.Int("datasource.maxIdleConnections", 100, "Defines the number of idle (keep-alive connections) to configured datasource. "+
		"Consider to set this value equal to the value: groups_total * group.concurrency. Too low value may result into high number of sockets in TIME_WAIT state.")
)
// Init creates a Querier from provided flag values.
//
// It returns an error if -datasource.url is not set or if the HTTP transport
// cannot be built from the TLS-related flags.
func Init() (Querier, error) {
	if *addr == "" {
		return nil, fmt.Errorf("datasource.url is empty")
	}
	tr, err := utils.Transport(*addr, *tlsCertFile, *tlsKeyFile, *tlsCAFile, *tlsServerName, *tlsInsecureSkipVerify)
	if err != nil {
		return nil, fmt.Errorf("failed to create transport: %w", err)
	}
	tr.MaxIdleConns = *maxIdleConnections
	// All queries go to the single -datasource.url host, while net/http keeps
	// only http.DefaultMaxIdleConnsPerHost idle connections per host unless
	// told otherwise. Without raising the per-host limit as well, the
	// -datasource.maxIdleConnections flag would have no practical effect.
	tr.MaxIdleConnsPerHost = *maxIdleConnections
	c := &http.Client{Transport: tr}
	return NewVMStorage(*addr, *basicAuthUsername, *basicAuthPassword, *lookBack, c), nil
}

View File

@@ -0,0 +1,112 @@
package datasource
import (
"context"
"encoding/json"
"fmt"
"io/ioutil"
"net/http"
"net/url"
"strconv"
"strings"
"time"
)
// response mirrors the JSON document returned by the /api/v1/query endpoint.
type response struct {
	Status string `json:"status"`
	Data   struct {
		ResultType string `json:"resultType"`
		Result     []struct {
			Labels map[string]string `json:"metric"`
			// TV holds the [timestamp, value] pair: the timestamp is decoded
			// as a JSON number and the value as a JSON string.
			TV [2]interface{} `json:"value"`
		} `json:"result"`
	} `json:"data"`
	ErrorType string `json:"errorType"`
	Error     string `json:"error"`
}

// metrics converts the query result into a list of Metric values.
//
// An error is returned instead of panicking when an entry has a timestamp
// that is not a number, or a value that is not a string holding a float64.
func (r response) metrics() ([]Metric, error) {
	var ms []Metric
	for _, res := range r.Data.Result {
		rawValue, ok := res.TV[1].(string)
		if !ok {
			return nil, fmt.Errorf("metric %v, unexpected value type %T; expected string", res, res.TV[1])
		}
		f, err := strconv.ParseFloat(rawValue, 64)
		if err != nil {
			return nil, fmt.Errorf("metric %v, unable to parse float64 from %s: %w", res, rawValue, err)
		}
		ts, ok := res.TV[0].(float64)
		if !ok {
			return nil, fmt.Errorf("metric %v, unexpected timestamp type %T; expected number", res, res.TV[0])
		}
		var m Metric
		for k, v := range res.Labels {
			m.AddLabel(k, v)
		}
		m.Timestamp = int64(ts)
		m.Value = f
		ms = append(ms, m)
	}
	return ms, nil
}
// queryPath is appended to the datasource base URL to build the instant query
// endpoint; the escaped query expression is appended right after it.
const queryPath = "/api/v1/query?query="

// VMStorage represents vmstorage entity with ability to read and write metrics
type VMStorage struct {
	c             *http.Client
	queryURL      string
	basicAuthUser string
	basicAuthPass string
	lookBack      time.Duration
}

// NewVMStorage is a constructor for VMStorage.
//
// A single trailing slash is removed from baseURL before queryPath is
// appended to it. The remaining arguments are stored as is.
func NewVMStorage(baseURL, basicAuthUser, basicAuthPass string, lookBack time.Duration, c *http.Client) *VMStorage {
	s := new(VMStorage)
	s.c = c
	s.basicAuthUser = basicAuthUser
	s.basicAuthPass = basicAuthPass
	s.queryURL = strings.TrimSuffix(baseURL, "/") + queryPath
	s.lookBack = lookBack
	return s
}
// Query reads metrics from datasource by given query.
//
// The query is sent as a POST request to the configured query URL. If a
// lookback is configured, a "time" parameter equal to now minus the lookback
// is added to the request. Only successful responses of the "vector" result
// type are accepted; everything else is returned as an error.
func (s *VMStorage) Query(ctx context.Context, query string) ([]Metric, error) {
	const (
		statusSuccess, statusError, rtVector = "success", "error", "vector"
	)
	q := s.queryURL + url.QueryEscape(query)
	if s.lookBack > 0 {
		lookBack := time.Now().Add(-s.lookBack)
		q += fmt.Sprintf("&time=%d", lookBack.Unix())
	}
	// Bind the context at construction time instead of copying the request
	// afterwards with WithContext.
	req, err := http.NewRequestWithContext(ctx, "POST", q, nil)
	if err != nil {
		return nil, err
	}
	req.Header.Set("Content-Type", "application/json; charset=utf-8")
	// Send credentials when either part is configured: a username with an
	// empty password is a valid basic auth pair and must not be dropped.
	if s.basicAuthUser != "" || s.basicAuthPass != "" {
		req.SetBasicAuth(s.basicAuthUser, s.basicAuthPass)
	}
	resp, err := s.c.Do(req)
	if err != nil {
		return nil, fmt.Errorf("error getting response from %s: %w", req.URL, err)
	}
	defer func() { _ = resp.Body.Close() }()
	if resp.StatusCode != http.StatusOK {
		// the body is used only to enrich the error message,
		// so a failure to read it is deliberately ignored
		body, _ := ioutil.ReadAll(resp.Body)
		return nil, fmt.Errorf("datasource returns unexpected response code %d for %s. Response body %s", resp.StatusCode, req.URL, body)
	}
	r := &response{}
	if err := json.NewDecoder(resp.Body).Decode(r); err != nil {
		return nil, fmt.Errorf("error parsing metrics for %s: %w", req.URL, err)
	}
	if r.Status == statusError {
		return nil, fmt.Errorf("response error, query: %s, errorType: %s, error: %s", req.URL, r.ErrorType, r.Error)
	}
	if r.Status != statusSuccess {
		return nil, fmt.Errorf("unknown status: %s, Expected success or error ", r.Status)
	}
	if r.Data.ResultType != rtVector {
		return nil, fmt.Errorf("unknown result type:%s. Expected vector", r.Data.ResultType)
	}
	return r.metrics()
}

View File

@@ -0,0 +1,101 @@
package datasource
import (
"context"
"net/http"
"net/http/httptest"
"strconv"
"testing"
"time"
)
// Shared fixtures used by the datasource tests.
var (
// ctx is the context passed to Query calls.
ctx = context.Background()
// basicAuthName and basicAuthPass are the credentials the test server
// expects to receive.
basicAuthName = "foo"
basicAuthPass = "bar"
// query is the expression sent to the test server.
query = "vm_rows"
)
// TestVMSelectQuery runs VMStorage.Query against a test server which answers
// every consecutive request differently: a dropped connection, a 500 status,
// a malformed body, an error status, an unknown status, a non-vector result
// and finally a valid vector response.
func TestVMSelectQuery(t *testing.T) {
	mux := http.NewServeMux()
	mux.HandleFunc("/", func(_ http.ResponseWriter, _ *http.Request) {
		t.Errorf("should not be called")
	})
	// c counts the requests served so far and selects the response to send.
	c := -1
	mux.HandleFunc("/api/v1/query", func(w http.ResponseWriter, r *http.Request) {
		c++
		if r.Method != http.MethodPost {
			t.Errorf("expected POST method got %s", r.Method)
		}
		if name, pass, _ := r.BasicAuth(); name != basicAuthName || pass != basicAuthPass {
			t.Errorf("expected %s:%s as basic auth got %s:%s", basicAuthName, basicAuthPass, name, pass)
		}
		if r.URL.Query().Get("query") != query {
			t.Errorf("expected %s in query param, got %s", query, r.URL.Query().Get("query"))
		}
		timeParam := r.URL.Query().Get("time")
		if timeParam == "" {
			t.Errorf("expected 'time' in query param, got nil instead")
		}
		if _, err := strconv.ParseInt(timeParam, 10, 64); err != nil {
			t.Errorf("failed to parse 'time' query param: %s", err)
		}
		switch c {
		case 0:
			conn, _, _ := w.(http.Hijacker).Hijack()
			_ = conn.Close()
		case 1:
			w.WriteHeader(500)
		case 2:
			w.Write([]byte("[]"))
		case 3:
			w.Write([]byte(`{"status":"error", "errorType":"type:", "error":"some error msg"}`))
		case 4:
			w.Write([]byte(`{"status":"unknown"}`))
		case 5:
			w.Write([]byte(`{"status":"success","data":{"resultType":"matrix"}}`))
		case 6:
			w.Write([]byte(`{"status":"success","data":{"resultType":"vector","result":[{"metric":{"__name__":"vm_rows"},"value":[1583786142,"13763"]}]}}`))
		}
	})

	srv := httptest.NewServer(mux)
	defer srv.Close()
	am := NewVMStorage(srv.URL, basicAuthName, basicAuthPass, time.Minute, srv.Client())
	if _, err := am.Query(ctx, query); err == nil {
		t.Fatalf("expected connection error got nil")
	}
	if _, err := am.Query(ctx, query); err == nil {
		t.Fatalf("expected invalid response status error got nil")
	}
	if _, err := am.Query(ctx, query); err == nil {
		t.Fatalf("expected response body error got nil")
	}
	if _, err := am.Query(ctx, query); err == nil {
		t.Fatalf("expected error status got nil")
	}
	if _, err := am.Query(ctx, query); err == nil {
		t.Fatalf("expected unknown status got nil")
	}
	if _, err := am.Query(ctx, query); err == nil {
		t.Fatalf("expected non-vector resultType error got nil")
	}
	m, err := am.Query(ctx, query)
	if err != nil {
		t.Fatalf("unexpected %s", err)
	}
	if len(m) != 1 {
		t.Fatalf("expected 1 metric got %d in %+v", len(m), m)
	}
	expected := Metric{
		Labels:    []Label{{Value: "vm_rows", Name: "__name__"}},
		Timestamp: 1583786142,
		Value:     13763,
	}
	if len(m[0].Labels) != len(expected.Labels) {
		t.Fatalf("unexpected metric %+v want %+v", m[0], expected)
	}
	// Any single differing field is a failure, so the checks are joined
	// with "or"; joining them with "and" would require every field to be
	// wrong at once before the test fails.
	if m[0].Timestamp != expected.Timestamp ||
		m[0].Value != expected.Value ||
		m[0].Labels[0].Value != expected.Labels[0].Value ||
		m[0].Labels[0].Name != expected.Labels[0].Name {
		t.Fatalf("unexpected metric %+v want %+v", m[0], expected)
	}
}

View File

@@ -0,0 +1,8 @@
ARG base_image
FROM $base_image
EXPOSE 8880
ENTRYPOINT ["/vmalert-prod"]
ARG src_binary
COPY $src_binary ./vmalert-prod

357
app/vmalert/group.go Normal file
View File

@@ -0,0 +1,357 @@
package main
import (
"context"
"fmt"
"hash/fnv"
"sync"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/metrics"
)
// Group is an entity for grouping rules
type Group struct {
// mu is held by start while the group is being updated with a new
// configuration.
mu sync.RWMutex
Name string
// File together with Name is used for building the group ID.
File string
Rules []Rule
// Interval is the period of the evaluation ticker created in start.
Interval time.Duration
// Concurrency is passed to the executor as the limit of rules
// evaluated at the same time.
Concurrency int
Checksum string
// doneCh is closed by close in order to stop the evaluation loop.
doneCh chan struct{}
// finishedCh is closed when start returns.
finishedCh chan struct{}
// channel accepts new Group obj
// which supposed to update current group
updateCh chan *Group
metrics *groupMetrics
}
// groupMetrics holds the per-group evaluation metrics.
type groupMetrics struct {
	iterationTotal    *counter
	iterationDuration *summary
}

// newGroupMetrics returns metrics labeled with the given group name and file.
// The metrics are obtained via getOrCreateCounter and getOrCreateSummary.
func newGroupMetrics(name, file string) *groupMetrics {
	labels := fmt.Sprintf(`group=%q, file=%q`, name, file)
	return &groupMetrics{
		iterationTotal:    getOrCreateCounter(fmt.Sprintf(`vmalert_iteration_total{%s}`, labels)),
		iterationDuration: getOrCreateSummary(fmt.Sprintf(`vmalert_iteration_duration_seconds{%s}`, labels)),
	}
}
// newGroup builds a Group from its configuration.
//
// defaultInterval is used when the configuration does not set an interval,
// and concurrency below 1 is raised to 1. Every entry of labels (external
// labels) is written into the labels of each rule, replacing a rule label
// with the same name; such replacements are logged.
func newGroup(cfg config.Group, defaultInterval time.Duration, labels map[string]string) *Group {
	g := &Group{
		Name:        cfg.Name,
		File:        cfg.File,
		Interval:    cfg.Interval,
		Concurrency: cfg.Concurrency,
		Checksum:    cfg.Checksum,
		doneCh:      make(chan struct{}),
		finishedCh:  make(chan struct{}),
		updateCh:    make(chan *Group),
	}
	g.metrics = newGroupMetrics(g.Name, g.File)
	if g.Interval == 0 {
		g.Interval = defaultInterval
	}
	if g.Concurrency < 1 {
		g.Concurrency = 1
	}

	built := make([]Rule, len(cfg.Rules))
	for idx := range cfg.Rules {
		ruleCfg := cfg.Rules[idx]
		// override rule labels with external labels
		for name, value := range labels {
			if previous, found := ruleCfg.Labels[name]; found {
				logger.Infof("label %q=%q for rule %q.%q overwritten with external label %q=%q",
					name, previous, g.Name, ruleCfg.Name(), name, value)
			}
			if ruleCfg.Labels == nil {
				ruleCfg.Labels = map[string]string{}
			}
			ruleCfg.Labels[name] = value
		}
		built[idx] = g.newRule(ruleCfg)
	}
	g.Rules = built
	return g
}
// newRule creates a rule belonging to g from the given configuration:
// an alerting rule when `alert` is set and a recording rule otherwise.
func (g *Group) newRule(rule config.Rule) Rule {
	if rule.Alert == "" {
		return newRecordingRule(g, rule)
	}
	return newAlertingRule(g, rule)
}
// ID returns the group ID: an FNV-1a 64-bit hash of the rules file name
// and the group name separated by the 0xff byte.
func (g *Group) ID() uint64 {
	h := fnv.New64a()
	for _, part := range []string{g.File, "\xff", g.Name} {
		h.Write([]byte(part))
	}
	return h.Sum64()
}
// Restore restores alerts state for group rules.
//
// Only alerting rules with a positive `for` value are restored; all the other
// rules are skipped. The first restore failure aborts the process and is
// returned to the caller.
func (g *Group) Restore(ctx context.Context, q datasource.Querier, lookback time.Duration, labels map[string]string) error {
	for _, rule := range g.Rules {
		alerting, isAlerting := rule.(*AlertingRule)
		if !isAlerting || alerting.For < 1 {
			continue
		}
		err := alerting.Restore(ctx, q, lookback, labels)
		if err != nil {
			return fmt.Errorf("error while restoring rule %q: %w", rule, err)
		}
	}
	return nil
}
// updateWith updates existing group with
// passed group object. This function ignores group
// evaluation interval change. It supposed to be updated
// in group.start function.
// Not thread-safe.
//
// Rules present in both groups are updated in place, rules missing from
// newGroup are closed and dropped, and rules present only in newGroup are
// appended. If updating a rule fails, the error is returned and g.Rules
// keeps its previous set of rules: nothing is closed or removed in that
// case, although rules updated before the failure stay updated.
func (g *Group) updateWith(newGroup *Group) error {
	rulesRegistry := make(map[uint64]Rule, len(newGroup.Rules))
	for _, nr := range newGroup.Rules {
		rulesRegistry[nr.ID()] = nr
	}

	var newRules []Rule
	// Rules to drop are only collected here and are closed after every
	// update has succeeded. Closing them right away and replacing them with
	// nil inside g.Rules would leave nil entries behind on a later failure.
	var removed []Rule
	for _, or := range g.Rules {
		nr, ok := rulesRegistry[or.ID()]
		if !ok {
			// old rule is not present in the new list
			// so we mark it for removing
			removed = append(removed, or)
			continue
		}
		if err := or.UpdateWith(nr); err != nil {
			return err
		}
		newRules = append(newRules, or)
		delete(rulesRegistry, nr.ID())
	}

	for _, r := range removed {
		r.Close()
	}
	// add the rest of rules from registry
	for _, nr := range rulesRegistry {
		newRules = append(newRules, nr)
	}
	g.Concurrency = newGroup.Concurrency
	g.Checksum = newGroup.Checksum
	g.Rules = newRules
	return nil
}
// Counters related to alerts.
var (
alertsFired = metrics.NewCounter(`vmalert_alerts_fired_total`)
// alertsSent is increased by the number of alerts handed over to notifiers.
alertsSent = metrics.NewCounter(`vmalert_alerts_sent_total`)
// alertsSendErrors is increased for every notifier that fails to send alerts.
alertsSendErrors = metrics.NewCounter(`vmalert_alerts_send_errors_total`)
)
// close stops the group: it signals the evaluation loop to stop, waits until
// start returns, unregisters the group metrics and closes all the rules.
// It is a no-op for a group without doneCh.
func (g *Group) close() {
if g.doneCh == nil {
return
}
close(g.doneCh)
// wait until start returns and closes finishedCh
<-g.finishedCh
metrics.UnregisterMetric(g.metrics.iterationDuration.name)
metrics.UnregisterMetric(g.metrics.iterationTotal.name)
for _, rule := range g.Rules {
rule.Close()
}
}
// skipRandSleepOnGroupStart disables the initial sleep in Group.start.
var skipRandSleepOnGroupStart bool

// start runs the evaluation loop of the group until ctx is cancelled or the
// group is closed. finishedCh is closed when the function returns.
//
// On every tick all the group rules are executed with the configured
// concurrency. A new Group received via updateCh is applied with updateWith;
// if its interval differs from the current one, the ticker is re-created.
func (g *Group) start(ctx context.Context, querier datasource.Querier, nts []notifier.Notifier, rw *remotewrite.Client) {
	defer func() { close(g.finishedCh) }()

	// Spread group rules evaluation over time in order to reduce load on VictoriaMetrics.
	if !skipRandSleepOnGroupStart {
		randSleep := uint64(float64(g.Interval) * (float64(uint32(g.ID())) / (1 << 32)))
		sleepOffset := uint64(time.Now().UnixNano()) % uint64(g.Interval)
		if randSleep < sleepOffset {
			randSleep += uint64(g.Interval)
		}
		randSleep -= sleepOffset
		sleepTimer := time.NewTimer(time.Duration(randSleep))
		select {
		case <-ctx.Done():
			sleepTimer.Stop()
			return
		case <-g.doneCh:
			sleepTimer.Stop()
			return
		case <-sleepTimer.C:
		}
	}

	logger.Infof("group %q started; interval=%v; concurrency=%d", g.Name, g.Interval, g.Concurrency)
	e := &executor{querier, nts, rw}
	t := time.NewTicker(g.Interval)
	// The closure reads t at return time, so the ticker created after an
	// interval change is the one being stopped. A plain `defer t.Stop()`
	// would bind the initial ticker and leave the replacement running.
	defer func() { t.Stop() }()
	for {
		select {
		case <-ctx.Done():
			logger.Infof("group %q: context cancelled", g.Name)
			return
		case <-g.doneCh:
			logger.Infof("group %q: received stop signal", g.Name)
			return
		case ng := <-g.updateCh:
			g.mu.Lock()
			err := g.updateWith(ng)
			if err != nil {
				logger.Errorf("group %q: failed to update: %s", g.Name, err)
				g.mu.Unlock()
				continue
			}
			if g.Interval != ng.Interval {
				g.Interval = ng.Interval
				t.Stop()
				t = time.NewTicker(g.Interval)
			}
			g.mu.Unlock()
			logger.Infof("group %q re-started; interval=%v; concurrency=%d", g.Name, g.Interval, g.Concurrency)
		case <-t.C:
			g.metrics.iterationTotal.Inc()
			iterationStart := time.Now()
			errs := e.execConcurrently(ctx, g.Rules, g.Concurrency, g.Interval)
			for err := range errs {
				if err != nil {
					logger.Errorf("group %q: %s", g.Name, err)
				}
			}
			g.metrics.iterationDuration.UpdateDuration(iterationStart)
		}
	}
}
// executor evaluates rules and delivers the results.
type executor struct {
// querier is passed to every rule execution.
querier datasource.Querier
// notifiers receive the alerts produced by alerting rules.
notifiers []notifier.Notifier
// rw, if set, receives the time series returned by rule execution.
rw *remotewrite.Client
}
// execConcurrently executes the given rules and returns a channel with one
// result per rule. The channel is closed once all the rules are executed.
//
// With concurrency equal to 1 the rules are executed one by one before the
// function returns. Otherwise they are executed in background goroutines,
// with at most `concurrency` of them running at the same time.
func (e *executor) execConcurrently(ctx context.Context, rules []Rule, concurrency int, interval time.Duration) chan error {
	// the buffer fits a result for every rule, so workers never block on send
	results := make(chan error, len(rules))
	// time series are needed only if there is a remote write client for them
	returnSeries := e.rw != nil

	if concurrency == 1 {
		// fast path
		for _, rule := range rules {
			results <- e.exec(ctx, rule, returnSeries, interval)
		}
		close(results)
		return results
	}

	limiter := make(chan struct{}, concurrency)
	go func() {
		var wg sync.WaitGroup
		for _, rule := range rules {
			limiter <- struct{}{}
			wg.Add(1)
			go func(r Rule) {
				defer wg.Done()
				defer func() { <-limiter }()
				results <- e.exec(ctx, r, returnSeries, interval)
			}(rule)
		}
		wg.Wait()
		close(results)
	}()
	return results
}
// Metrics updated on every rule execution.
var (
execTotal = metrics.NewCounter(`vmalert_execution_total`)
execErrors = metrics.NewCounter(`vmalert_execution_errors_total`)
execDuration = metrics.NewSummary(`vmalert_execution_duration_seconds`)
// remoteWriteErrors is increased when pushing a time series to remote write fails.
remoteWriteErrors = metrics.NewCounter(`vmalert_remotewrite_errors_total`)
)
// exec executes a single rule and delivers its results.
//
// Time series returned by the rule are pushed to the remote write client when
// it is configured. For alerting rules, firing and inactive alerts are then
// sent to every notifier; failures of individual notifiers are collected and
// returned as a single error.
func (e *executor) exec(ctx context.Context, rule Rule, returnSeries bool, interval time.Duration) error {
	execTotal.Inc()
	startedAt := time.Now()
	defer execDuration.UpdateDuration(startedAt)

	series, err := rule.Exec(ctx, e.querier, returnSeries)
	if err != nil {
		execErrors.Inc()
		return fmt.Errorf("rule %q: failed to execute: %w", rule, err)
	}

	if e.rw != nil {
		for _, ts := range series {
			if pushErr := e.rw.Push(ts); pushErr != nil {
				remoteWriteErrors.Inc()
				return fmt.Errorf("rule %q: remote write failure: %w", rule, pushErr)
			}
		}
	}

	alertingRule, isAlerting := rule.(*AlertingRule)
	if !isAlerting {
		return nil
	}

	var toSend []notifier.Alert
	for _, a := range alertingRule.alerts {
		if a.State == notifier.StateFiring {
			// set End to the current time + 3 intervals
			// so notifier can resolve it automatically if `vmalert`
			// won't be able to send resolve for some reason
			a.End = time.Now().Add(3 * interval)
		} else if a.State == notifier.StateInactive {
			// set End to the current time to notify
			// that it was just resolved
			a.End = time.Now()
		} else {
			// alerts in any other state are not sent
			continue
		}
		toSend = append(toSend, *a)
	}
	if len(toSend) == 0 {
		return nil
	}

	alertsSent.Add(len(toSend))
	sendErrs := new(utils.ErrGroup)
	for _, nt := range e.notifiers {
		sendErr := nt.Send(ctx, toSend)
		if sendErr == nil {
			continue
		}
		alertsSendErrors.Inc()
		sendErrs.Add(fmt.Errorf("rule %q: failed to send alerts: %w", rule, sendErr))
	}
	return sendErrs.Err()
}

223
app/vmalert/group_test.go Normal file
View File

@@ -0,0 +1,223 @@
package main
import (
"context"
"sort"
"testing"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
)
// init adjusts package-level settings for the test run.
func init() {
// Disable rand sleep on group start during tests in order to speed up test execution.
// Rand sleep is needed only in prod code.
skipRandSleepOnGroupStart = true
}
// TestUpdateWith checks that Group.updateWith turns the current set of rules
// into the new one: rules are added, updated in place or removed, and the
// resulting rules match the rules of the new group.
func TestUpdateWith(t *testing.T) {
	testCases := []struct {
		name         string
		currentRules []config.Rule
		newRules     []config.Rule
	}{
		{
			"new rule",
			nil,
			[]config.Rule{{Alert: "bar"}},
		},
		{
			"update alerting rule",
			[]config.Rule{{
				Alert: "foo",
				Expr:  "up > 0",
				For:   config.NewPromDuration(time.Second),
				Labels: map[string]string{
					"bar": "baz",
				},
				Annotations: map[string]string{
					"summary":     "{{ $value|humanize }}",
					"description": "{{$labels}}",
				},
			}},
			[]config.Rule{{
				Alert: "foo",
				Expr:  "up > 10",
				For:   config.NewPromDuration(time.Second),
				Labels: map[string]string{
					"baz": "bar",
				},
				Annotations: map[string]string{
					"summary": "none",
				},
			}},
		},
		{
			"update recording rule",
			[]config.Rule{{
				Record: "foo",
				Expr:   "max(up)",
				Labels: map[string]string{
					"bar": "baz",
				},
			}},
			[]config.Rule{{
				Record: "foo",
				Expr:   "min(up)",
				Labels: map[string]string{
					"baz": "bar",
				},
			}},
		},
		{
			"empty rule",
			[]config.Rule{{Alert: "foo"}, {Record: "bar"}},
			nil,
		},
		{
			"multiple rules",
			[]config.Rule{
				{Alert: "bar"},
				{Alert: "baz"},
				{Alert: "foo"},
			},
			[]config.Rule{
				{Alert: "baz"},
				{Record: "foo"},
			},
		},
		{
			"replace rule",
			[]config.Rule{{Alert: "foo1"}},
			[]config.Rule{{Alert: "foo2"}},
		},
		{
			"replace multiple rules",
			[]config.Rule{
				{Alert: "foo1"},
				{Record: "foo2"},
				{Alert: "foo3"},
			},
			[]config.Rule{
				{Alert: "foo3"},
				{Alert: "foo4"},
				{Record: "foo5"},
			},
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			g := &Group{Name: "test"}
			for _, r := range tc.currentRules {
				r.ID = config.HashRule(r)
				g.Rules = append(g.Rules, g.newRule(r))
			}

			ng := &Group{Name: "test"}
			for _, r := range tc.newRules {
				r.ID = config.HashRule(r)
				ng.Rules = append(ng.Rules, ng.newRule(r))
			}

			err := g.updateWith(ng)
			if err != nil {
				t.Fatal(err)
			}

			// the expected number of rules comes from the new configuration,
			// the actual one from the updated group
			if len(g.Rules) != len(tc.newRules) {
				t.Fatalf("expected to have %d rules; got: %d",
					len(tc.newRules), len(g.Rules))
			}
			// updateWith does not guarantee the order of rules,
			// so both lists are sorted by ID before comparison
			sort.Slice(g.Rules, func(i, j int) bool {
				return g.Rules[i].ID() < g.Rules[j].ID()
			})
			sort.Slice(ng.Rules, func(i, j int) bool {
				return ng.Rules[i].ID() < ng.Rules[j].ID()
			})
			for i, r := range g.Rules {
				got, want := r, ng.Rules[i]
				if got.ID() != want.ID() {
					t.Fatalf("expected to have rule %q; got %q", want, got)
				}
				if err := compareRules(t, got, want); err != nil {
					t.Fatalf("comparison error: %s", err)
				}
			}
		})
	}
}
// TestGroupStart starts a group built from a rules file with a fake querier
// and a fake notifier, and checks the alerts received by the notifier: first
// for two metrics returned by the querier and then for a single one.
func TestGroupStart(t *testing.T) {
// TODO: make parsing from string instead of file
groups, err := config.Parse([]string{"config/testdata/rules1-good.rules"}, true, true)
if err != nil {
t.Fatalf("failed to parse rules: %s", err)
}
const evalInterval = time.Millisecond
g := newGroup(groups[0], evalInterval, map[string]string{"cluster": "east-1"})
g.Concurrency = 2
fn := &fakeNotifier{}
fs := &fakeQuerier{}
const inst1, inst2, job = "foo", "bar", "baz"
m1 := metricWithLabels(t, "instance", inst1, "job", job)
m2 := metricWithLabels(t, "instance", inst2, "job", job)
// build the alerts expected to be produced by the first rule of the group
r := g.Rules[0].(*AlertingRule)
alert1, err := r.newAlert(m1, time.Now(), nil)
if err != nil {
t.Fatalf("faield to create alert: %s", err)
}
alert1.State = notifier.StateFiring
// add external label
alert1.Labels["cluster"] = "east-1"
// add rule labels - see config/testdata/rules1-good.rules
alert1.Labels["label"] = "bar"
alert1.Labels["host"] = inst1
alert1.ID = hash(m1)
alert2, err := r.newAlert(m2, time.Now(), nil)
if err != nil {
t.Fatalf("faield to create alert: %s", err)
}
alert2.State = notifier.StateFiring
// add external label
alert2.Labels["cluster"] = "east-1"
// add rule labels - see config/testdata/rules1-good.rules
alert2.Labels["label"] = "bar"
alert2.Labels["host"] = inst2
alert2.ID = hash(m2)
// finished is closed once g.start returns
finished := make(chan struct{})
fs.add(m1)
fs.add(m2)
go func() {
g.start(context.Background(), fs, []notifier.Notifier{fn}, nil)
close(finished)
}()
// wait for multiple evals
time.Sleep(20 * evalInterval)
gotAlerts := fn.getAlerts()
expectedAlerts := []notifier.Alert{*alert1, *alert2}
compareAlerts(t, expectedAlerts, gotAlerts)
// reset previous data
fs.reset()
// and set only one datapoint for response
fs.add(m1)
// wait for multiple evals
time.Sleep(20 * evalInterval)
gotAlerts = fn.getAlerts()
expectedAlerts = []notifier.Alert{*alert1}
compareAlerts(t, expectedAlerts, gotAlerts)
// stop the group and wait for the goroutine running g.start to exit
g.close()
<-finished
}

232
app/vmalert/helpers_test.go Normal file
View File

@@ -0,0 +1,232 @@
package main
import (
"context"
"fmt"
"reflect"
"sort"
"sync"
"testing"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
)
// fakeQuerier is a test double for a datasource querier: Query returns
// either the configured error or a copy of the stored metrics.
type fakeQuerier struct {
// Mutex guards metrics and err.
sync.Mutex
// metrics holds the canned results handed out by Query.
metrics []datasource.Metric
// err, when non-nil, is returned by Query instead of metrics.
err error
}
// setErr makes every following Query call fail with err
// (or succeed again when err is nil).
func (fq *fakeQuerier) setErr(err error) {
	fq.Lock()
	defer fq.Unlock()
	fq.err = err
}
// reset clears the configured error and drops all stored metrics
// while keeping the backing storage of the metrics slice.
func (fq *fakeQuerier) reset() {
	fq.Lock()
	defer fq.Unlock()
	fq.metrics = fq.metrics[:0]
	fq.err = nil
}
// add appends the given metrics to the set returned by Query.
func (fq *fakeQuerier) add(metrics ...datasource.Metric) {
	fq.Lock()
	defer fq.Unlock()
	fq.metrics = append(fq.metrics, metrics...)
}
// Query ignores its arguments and returns the configured error, if any,
// or a private copy of the stored metrics otherwise.
func (fq *fakeQuerier) Query(_ context.Context, _ string) ([]datasource.Metric, error) {
	fq.Lock()
	defer fq.Unlock()
	if fq.err == nil {
		// hand out a copy so callers cannot touch the stored slice
		snapshot := make([]datasource.Metric, len(fq.metrics))
		copy(snapshot, fq.metrics)
		return snapshot, nil
	}
	return nil, fq.err
}
// fakeNotifier is a test double for a notifier: it remembers the alerts
// passed to the most recent Send call.
type fakeNotifier struct {
// Mutex guards alerts.
sync.Mutex
// alerts is the slice received by the last Send call.
alerts []notifier.Alert
}
// Send stores alerts, replacing whatever was stored before, and never fails.
func (fn *fakeNotifier) Send(_ context.Context, alerts []notifier.Alert) error {
	fn.Lock()
	fn.alerts = alerts
	fn.Unlock()
	return nil
}
// getAlerts returns the alerts received by the last Send call.
func (fn *fakeNotifier) getAlerts() []notifier.Alert {
	fn.Lock()
	received := fn.alerts
	fn.Unlock()
	return received
}
// metricWithValueAndLabels builds a metric from name/value label pairs
// (see metricWithLabels) and sets its Value field to value.
func metricWithValueAndLabels(t *testing.T, value float64, labels ...string) datasource.Metric {
	t.Helper()
	metric := metricWithLabels(t, labels...)
	metric.Value = value
	return metric
}
// metricWithLabels builds a metric whose labels are taken from the given
// flat list of name, value, name, value, ... strings.
// The test is failed when the list is empty or has an odd length.
func metricWithLabels(t *testing.T, labels ...string) datasource.Metric {
	t.Helper()
	if n := len(labels); n == 0 || n%2 != 0 {
		t.Fatalf("expected to get even number of labels")
	}
	var metric datasource.Metric
	// walk over the value positions; the matching name sits right before each
	for v := 1; v < len(labels); v += 2 {
		pair := datasource.Label{Name: labels[v-1], Value: labels[v]}
		metric.Labels = append(metric.Labels, pair)
	}
	return metric
}
// compareGroups fails the test unless group b (the actual value) matches
// group a (the expected value) in name, file, interval, ID, number of rules
// and, pairwise by position, in the rule fields checked by compareRules.
func compareGroups(t *testing.T, a, b *Group) {
	t.Helper()
	if a.Name != b.Name {
		t.Fatalf("expected group name %q; got %q", a.Name, b.Name)
	}
	if a.File != b.File {
		t.Fatalf("expected group %q file name %q; got %q", a.Name, a.File, b.File)
	}
	if a.Interval != b.Interval {
		t.Fatalf("expected group %q interval %v; got %v", a.Name, a.Interval, b.Interval)
	}
	if len(a.Rules) != len(b.Rules) {
		t.Fatalf("expected group %s to have %d rules; got: %d",
			a.Name, len(a.Rules), len(b.Rules))
	}
	// The group IDs do not depend on the rule being inspected, so they are
	// checked once here instead of on every loop iteration.
	if a.ID() != b.ID() {
		t.Fatalf("expected group %q to have ID %d; got %d", a.Name, a.ID(), b.ID())
	}
	// NOTE(review): rule IDs are intentionally not compared; expectations
	// built by hand in tests may not carry the IDs of parsed rules — confirm
	// before adding such a check.
	for i, want := range a.Rules {
		got := b.Rules[i]
		if err := compareRules(t, want, got); err != nil {
			t.Fatalf("comparison error: %s", err)
		}
	}
}
// compareRules compares expected rule a with actual rule b.
// Both must have the same concrete type (*AlertingRule or *RecordingRule);
// the type-specific comparison is then delegated to compareAlertingRules or
// compareRecordingRules. A non-nil error describes the first mismatch.
func compareRules(t *testing.T, a, b Rule) error {
	t.Helper()
	if wantAlerting, isAlerting := a.(*AlertingRule); isAlerting {
		gotAlerting, ok := b.(*AlertingRule)
		if !ok {
			return fmt.Errorf("rule %q supposed to be of type AlertingRule", b.ID())
		}
		return compareAlertingRules(t, wantAlerting, gotAlerting)
	}
	if wantRecording, isRecording := a.(*RecordingRule); isRecording {
		gotRecording, ok := b.(*RecordingRule)
		if !ok {
			return fmt.Errorf("rule %q supposed to be of type RecordingRule", b.ID())
		}
		return compareRecordingRules(t, wantRecording, gotRecording)
	}
	return fmt.Errorf("unexpected rule type received %T", a)
}
// compareRecordingRules reports the first difference between expected
// rule a and actual rule b; only Expr and Labels are compared.
func compareRecordingRules(t *testing.T, a, b *RecordingRule) error {
	t.Helper()
	switch {
	case a.Expr != b.Expr:
		return fmt.Errorf("expected to have expression %q; got %q", a.Expr, b.Expr)
	case !reflect.DeepEqual(a.Labels, b.Labels):
		return fmt.Errorf("expected to have labels %#v; got %#v", a.Labels, b.Labels)
	}
	return nil
}
// compareAlertingRules reports the first difference between expected
// rule a and actual rule b; Expr, For, Annotations and Labels are compared,
// in that order.
func compareAlertingRules(t *testing.T, a, b *AlertingRule) error {
	t.Helper()
	switch {
	case a.Expr != b.Expr:
		return fmt.Errorf("expected to have expression %q; got %q", a.Expr, b.Expr)
	case a.For != b.For:
		return fmt.Errorf("expected to have for %q; got %q", a.For, b.For)
	case !reflect.DeepEqual(a.Annotations, b.Annotations):
		return fmt.Errorf("expected to have annotations %#v; got %#v", a.Annotations, b.Annotations)
	case !reflect.DeepEqual(a.Labels, b.Labels):
		return fmt.Errorf("expected to have labels %#v; got %#v", a.Labels, b.Labels)
	}
	return nil
}
// compareTimeSeries compares expected series a with actual series b
// position by position. For every pair the sample values and then the label
// names and values must match; sample timestamps are not compared.
// A non-nil error describes the first mismatch.
func compareTimeSeries(t *testing.T, a, b []prompbmarshal.TimeSeries) error {
	t.Helper()
	if len(a) != len(b) {
		return fmt.Errorf("expected number of timeseries %d; got %d", len(a), len(b))
	}
	for idx := range a {
		want, have := a[idx], b[idx]

		if len(want.Samples) != len(have.Samples) {
			return fmt.Errorf("expected number of samples %d; got %d", len(want.Samples), len(have.Samples))
		}
		for s := range want.Samples {
			wantSample, haveSample := want.Samples[s], have.Samples[s]
			if haveSample.Value != wantSample.Value {
				return fmt.Errorf("expected value %.2f; got %.2f", wantSample.Value, haveSample.Value)
			}
			// timestamp validation isn't always correct for now.
			// this must be improved with time mock.
			/*if haveSample.Timestamp != wantSample.Timestamp {
				return fmt.Errorf("expected timestamp %d; got %d", wantSample.Timestamp, haveSample.Timestamp)
			}*/
		}

		if len(want.Labels) != len(have.Labels) {
			return fmt.Errorf("expected number of labels %d; got %d", len(want.Labels), len(have.Labels))
		}
		for l := range want.Labels {
			wantLabel, haveLabel := want.Labels[l], have.Labels[l]
			if haveLabel.Name != wantLabel.Name {
				return fmt.Errorf("expected label name %q; got %q", wantLabel.Name, haveLabel.Name)
			}
			if haveLabel.Value != wantLabel.Value {
				return fmt.Errorf("expected label value %q; got %q", wantLabel.Value, haveLabel.Value)
			}
		}
	}
	return nil
}
// compareAlerts fails the test unless the expected alerts as and the actual
// alerts bs have the same length and, after ordering both by ID, match
// pairwise in Name, State, Value, Annotations and Labels.
//
// Both input slices are sorted in place.
func compareAlerts(t *testing.T, as, bs []notifier.Alert) {
	t.Helper()
	if len(as) != len(bs) {
		t.Fatalf("expected to have length %d; got %d", len(as), len(bs))
	}
	// order both sides by ID so that alerts can be compared position by position
	sort.Slice(as, func(i, j int) bool {
		return as[i].ID < as[j].ID
	})
	sort.Slice(bs, func(i, j int) bool {
		return bs[i].ID < bs[j].ID
	})
	for i := range as {
		a, b := as[i], bs[i]
		if a.Name != b.Name {
			t.Fatalf("expected to have Name %q; got %q", a.Name, b.Name)
		}
		if a.State != b.State {
			t.Fatalf("expected to have State %q; got %q", a.State, b.State)
		}
		if a.Value != b.Value {
			t.Fatalf("expected to have Value %f; got %f", a.Value, b.Value)
		}
		if !reflect.DeepEqual(a.Annotations, b.Annotations) {
			t.Fatalf("expected to have annotations %#v; got %#v", a.Annotations, b.Annotations)
		}
		if !reflect.DeepEqual(a.Labels, b.Labels) {
			t.Fatalf("expected to have labels %#v; got %#v", a.Labels, b.Labels)
		}
	}
}

224
app/vmalert/main.go Normal file
View File

@@ -0,0 +1,224 @@
package main
import (
"context"
"flag"
"fmt"
"net/url"
"os"
"strconv"
"strings"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remoteread"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/envflag"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
"github.com/VictoriaMetrics/metrics"
)
// Command-line flags of vmalert.
var (
	// rulePath lists the files (or glob patterns) with rule groups to load.
	rulePath = flagutil.NewArray("rule", `Path to the file with alert rules.
Supports patterns. Flag can be specified multiple times.
Examples:
-rule="/path/to/file". Path to a single file with alerting rules
-rule="dir/*.yaml" -rule="/*.yaml". Relative path to all .yaml files in "dir" folder,
absolute path to all .yaml files in root.
Rule files may contain %{ENV_VAR} placeholders, which are substituted by the corresponding env vars.`)

	httpListenAddr = flag.String("httpListenAddr", ":8880", "Address to listen for http connections")
	evaluationInterval = flag.Duration("evaluationInterval", time.Minute, "How often to evaluate the rules")

	validateTemplates = flag.Bool("rule.validateTemplates", true, "Whether to validate annotation and label templates")
	validateExpressions = flag.Bool("rule.validateExpressions", true, "Whether to validate rules expressions via MetricsQL engine")

	externalURL = flag.String("external.url", "", "External URL is used as alert's source for sent alerts to the notifier")
	externalAlertSource = flag.String("external.alert.source", "", `External Alert Source allows to override the Source link for alerts sent to AlertManager for cases where you want to build a custom link to Grafana, Prometheus or any other service.
eg. 'explore?orgId=1&left=[\"now-1h\",\"now\",\"VictoriaMetrics\",{\"expr\": \"{{$expr|quotesEscape|pathEscape}}\"},{\"mode\":\"Metrics\"},{\"ui\":[true,true,true,\"none\"]}]'.If empty '/api/v1/:groupID/alertID/status' is used`)

	// externalLabels holds `name=value` pairs; the help text refers to the
	// flag by its registered name, `-external.label`.
	externalLabels = flagutil.NewArray("external.label", "Optional label in the form 'name=value' to add to all generated recording rules and alerts. "+
		"Pass multiple -external.label flags in order to add multiple label sets.")

	remoteReadLookBack = flag.Duration("remoteRead.lookback", time.Hour, "Lookback defines how far to look into past for alerts timeseries."+
		" For example, if lookback=1h then range from now() to now()-1h will be scanned.")

	dryRun = flag.Bool("dryRun", false, "Whether to check only config files without running vmalert. The rules file are validated. The `-rule` flag must be specified.")
)
// main parses flags and then either validates the rule files and exits
// (when -dryRun is set) or starts the rules manager, the SIGHUP-driven
// config reloader and the HTTP server, and blocks until a termination
// signal arrives.
func main() {
	// Write flags and help message to stdout, since it is easier to grep or pipe.
	flag.CommandLine.SetOutput(os.Stdout)
	flag.Usage = usage
	envflag.Parse()
	buildinfo.Init()
	logger.Init()

	if *dryRun {
		// template functions need some URL to be initialized; the value
		// itself is irrelevant for validation
		u, _ := url.Parse("https://victoriametrics.com/")
		notifier.InitTemplateFunc(u)
		groups, err := config.Parse(*rulePath, true, true)
		if err != nil {
			// pass the error as an argument: its text may contain '%'
			// and must not be interpreted as a format string
			logger.Fatalf("%s", err)
		}
		if len(groups) == 0 {
			logger.Fatalf("No rules for validation. Please specify path to file(s) with alerting and/or recording rules using `-rule` flag")
		}
		return
	}

	ctx, cancel := context.WithCancel(context.Background())
	manager, err := newManager(ctx)
	if err != nil {
		logger.Fatalf("failed to init: %s", err)
	}
	if err := manager.start(ctx, *rulePath, *validateTemplates, *validateExpressions); err != nil {
		logger.Fatalf("failed to start: %s", err)
	}

	go func() {
		// init reload metrics with positive values to improve alerting conditions
		configSuccess.Set(1)
		configTimestamp.Set(fasttime.UnixTimestamp())
		sigHup := procutil.NewSighupChan()
		for {
			<-sigHup
			configReloads.Inc()
			logger.Infof("SIGHUP received. Going to reload rules %q ...", *rulePath)
			if err := manager.update(ctx, *rulePath, *validateTemplates, *validateExpressions, false); err != nil {
				configReloadErrors.Inc()
				configSuccess.Set(0)
				logger.Errorf("error while reloading rules: %s", err)
				continue
			}
			configSuccess.Set(1)
			configTimestamp.Set(fasttime.UnixTimestamp())
			logger.Infof("Rules reloaded successfully from %q", *rulePath)
		}
	}()

	rh := &requestHandler{m: manager}
	go httpserver.Serve(*httpListenAddr, rh.handler)

	sig := procutil.WaitForSigterm()
	logger.Infof("service received signal %s", sig)
	if err := httpserver.Stop(*httpListenAddr); err != nil {
		logger.Fatalf("cannot stop the webservice: %s", err)
	}
	// cancel the context first so that groups stop, then wait for them
	cancel()
	manager.close()
}
// Metrics describing rules (re)loading; they are updated by the SIGHUP
// handler started in main.
var (
// configReloads is incremented on every received SIGHUP.
configReloads = metrics.NewCounter(`vmalert_config_last_reload_total`)
// configReloadErrors is incremented when a reload fails.
configReloadErrors = metrics.NewCounter(`vmalert_config_last_reload_errors_total`)
// configSuccess is set to 1 after a successful reload and to 0 after a failed one.
// NOTE(review): registered as a counter although it is overwritten via Set.
configSuccess = metrics.NewCounter(`vmalert_config_last_reload_successful`)
// configTimestamp is set to the current unix timestamp after a successful reload.
// NOTE(review): registered as a counter although it is overwritten via Set.
configTimestamp = metrics.NewCounter(`vmalert_config_last_reload_success_timestamp_seconds`)
)
// newManager builds a manager from the command-line flags: it initializes
// the datasource, the template functions, the alert URL generator, the
// notifiers, the remote write client and the remote read querier, and
// parses the `-external.label` values into the manager labels.
//
// The first failing initialization step is returned as a wrapped error.
func newManager(ctx context.Context) (*manager, error) {
	q, err := datasource.Init()
	if err != nil {
		return nil, fmt.Errorf("failed to init datasource: %w", err)
	}
	eu, err := getExternalURL(*externalURL, *httpListenAddr, httpserver.IsTLS())
	if err != nil {
		return nil, fmt.Errorf("failed to init `external.url`: %w", err)
	}
	notifier.InitTemplateFunc(eu)
	aug, err := getAlertURLGenerator(eu, *externalAlertSource, *validateTemplates)
	if err != nil {
		return nil, fmt.Errorf("failed to init `external.alert.source`: %w", err)
	}
	nts, err := notifier.Init(aug)
	if err != nil {
		return nil, fmt.Errorf("failed to init notifier: %w", err)
	}

	manager := &manager{
		groups:    make(map[uint64]*Group),
		querier:   q,
		notifiers: nts,
		labels:    map[string]string{},
	}
	rw, err := remotewrite.Init(ctx)
	if err != nil {
		return nil, fmt.Errorf("failed to init remoteWrite: %w", err)
	}
	manager.rw = rw

	rr, err := remoteread.Init()
	if err != nil {
		return nil, fmt.Errorf("failed to init remoteRead: %w", err)
	}
	manager.rr = rr

	for _, s := range *externalLabels {
		if len(s) == 0 {
			continue
		}
		// split on the first '=' only, so that values may contain '='
		n := strings.IndexByte(s, '=')
		if n < 0 {
			return nil, fmt.Errorf("missing '=' in `-external.label`. It must contain label in the form `name=value`; got %q", s)
		}
		manager.labels[s[:n]] = s[n+1:]
	}
	return manager, nil
}
// getExternalURL returns the URL under which this instance is reachable.
//
// A non-empty externalURL is parsed and returned as is. Otherwise the URL is
// built from the local hostname, the port of httpListenAddr (if any) and the
// "https" scheme when isSecure is set, "http" otherwise.
//
// An error is returned when the hostname cannot be determined or the
// resulting URL cannot be parsed.
func getExternalURL(externalURL, httpListenAddr string, isSecure bool) (*url.URL, error) {
	if externalURL != "" {
		return url.Parse(externalURL)
	}
	hname, err := os.Hostname()
	if err != nil {
		return nil, err
	}
	// The port is whatever follows the last ':'; this also works for
	// bracketed IPv6 addresses such as "[::1]:8880". A ':' that sits inside
	// the brackets (address without a port) is not a port separator.
	port := ""
	if n := strings.LastIndexByte(httpListenAddr, ':'); n >= 0 && !strings.Contains(httpListenAddr[n:], "]") {
		port = httpListenAddr[n:]
	}
	schema := "http://"
	if isSecure {
		schema = "https://"
	}
	return url.Parse(fmt.Sprintf("%s%s%s", schema, hname, port))
}
// getAlertURLGenerator returns the function used to build the source link
// of an alert.
//
// With an empty externalAlertSource the link points to the alert status
// page: "<externalURL>/api/v1/<groupID>/<alertID>/status". Otherwise
// externalAlertSource is treated as a template which is executed for every
// alert and appended to externalURL; when validateTemplate is set the
// template is validated first and a validation failure is returned as error.
func getAlertURLGenerator(externalURL *url.URL, externalAlertSource string, validateTemplate bool) (notifier.AlertURLGenerator, error) {
	if externalAlertSource == "" {
		statusLink := func(alert notifier.Alert) string {
			groupID := strconv.FormatUint(alert.GroupID, 10)
			alertID := strconv.FormatUint(alert.ID, 10)
			return fmt.Sprintf("%s/api/v1/%s/%s/status", externalURL, groupID, alertID)
		}
		return statusLink, nil
	}

	// the template is registered under a fixed key and looked up by it below
	source := map[string]string{"tpl": externalAlertSource}
	if validateTemplate {
		err := notifier.ValidateTemplates(source)
		if err != nil {
			return nil, fmt.Errorf("error validating source template %s: %w", externalAlertSource, err)
		}
	}

	templatedLink := func(alert notifier.Alert) string {
		rendered, err := alert.ExecTemplate(nil, source)
		if err != nil {
			// best effort: log and fall through with whatever was rendered
			logger.Errorf("can not exec source template %s", err)
		}
		return fmt.Sprintf("%s/%s", externalURL, rendered["tpl"])
	}
	return templatedLink, nil
}
// usage prints a short description of vmalert followed by the flags help.
func usage() {
	const description = `
vmalert processes alerts and recording rules.
See the docs at https://victoriametrics.github.io/vmalert.html .
`
	flagutil.Usage(description)
}

53
app/vmalert/main_test.go Normal file
View File

@@ -0,0 +1,53 @@
package main
import (
"fmt"
"net/url"
"os"
"testing"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
)
// TestGetExternalURL checks that an explicit external URL is returned
// unchanged and that the fallback URL is built from the hostname, the port
// of the listen address and the requested scheme.
func TestGetExternalURL(t *testing.T) {
	expURL := "https://vicotriametrics.com/path"
	u, err := getExternalURL(expURL, "", false)
	if err != nil {
		// stop here: u must not be used after a failure
		t.Fatalf("unexpected error %s", err)
	}
	if u.String() != expURL {
		t.Errorf("unexpected url want %s, got %s", expURL, u.String())
	}

	h, err := os.Hostname()
	if err != nil {
		t.Fatalf("cannot determine hostname: %s", err)
	}
	expURL = fmt.Sprintf("https://%s:4242", h)
	u, err = getExternalURL("", "0.0.0.0:4242", true)
	if err != nil {
		t.Fatalf("unexpected error %s", err)
	}
	if u.String() != expURL {
		t.Errorf("unexpected url want %s, got %s", expURL, u.String())
	}
}
// TestGetAlertURLGenerator checks the default status link, the rejection of
// an invalid source template and the rendering of a valid one.
func TestGetAlertURLGenerator(t *testing.T) {
	testAlert := notifier.Alert{GroupID: 42, ID: 2, Value: 4}
	u, _ := url.Parse("https://victoriametrics.com/path")

	fn, err := getAlertURLGenerator(u, "", false)
	if err != nil {
		// stop here: fn must not be called after a failure
		t.Fatalf("unexpected error %s", err)
	}
	if exp := "https://victoriametrics.com/path/api/v1/42/2/status"; exp != fn(testAlert) {
		t.Errorf("unexpected url want %s, got %s", exp, fn(testAlert))
	}

	_, err = getAlertURLGenerator(nil, "foo?{{invalid}}", true)
	if err == nil {
		t.Errorf("expected template validation error got nil")
	}

	fn, err = getAlertURLGenerator(u, "foo?query={{$value}}", true)
	if err != nil {
		t.Fatalf("unexpected error %s", err)
	}
	if exp := "https://victoriametrics.com/path/foo?query=4"; exp != fn(testAlert) {
		t.Errorf("unexpected url want %s, got %s", exp, fn(testAlert))
	}
}

158
app/vmalert/manager.go Normal file
View File

@@ -0,0 +1,158 @@
package main
import (
"context"
"fmt"
"strings"
"sync"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
)
// manager controls group states
type manager struct {
// querier is handed to every started group.
querier datasource.Querier
// notifiers are handed to every started group.
notifiers []notifier.Notifier
// rw is the remote write client handed to started groups; it may be nil
// and is closed by close.
rw *remotewrite.Client
// rr, when set, is used for restoring group state on start.
rr datasource.Querier
// wg tracks the goroutines of started groups.
wg sync.WaitGroup
// labels are passed to every newly created group.
labels map[string]string
// groupsMu guards groups.
groupsMu sync.RWMutex
// groups holds the running groups keyed by group ID.
groups map[uint64]*Group
}
// AlertAPI generates APIAlert object from alert by its ID(hash).
//
// gID selects the group and aID the alert inside it; only alerting rules of
// the group are searched. An error is returned when either the group or the
// alert cannot be found.
func (m *manager) AlertAPI(gID, aID uint64) (*APIAlert, error) {
	m.groupsMu.RLock()
	defer m.groupsMu.RUnlock()

	g, ok := m.groups[gID]
	if !ok {
		// IDs are integers: %d prints the number, while %q would print
		// a quoted character literal
		return nil, fmt.Errorf("can't find group with id %d", gID)
	}
	for _, rule := range g.Rules {
		ar, ok := rule.(*AlertingRule)
		if !ok {
			continue
		}
		if apiAlert := ar.AlertAPI(aID); apiAlert != nil {
			return apiAlert, nil
		}
	}
	return nil, fmt.Errorf("can't find alert with id %d in group %q", aID, g.Name)
}
// start loads the rule groups from path and starts them; it is the initial
// update, performed with state restoring enabled.
func (m *manager) start(ctx context.Context, path []string, validateTpl, validateExpr bool) error {
	const restoreState = true
	return m.update(ctx, path, validateTpl, validateExpr, restoreState)
}
// close shuts down the remote write client, if there is one, and then waits
// until the goroutines of all started groups have returned.
// A failure to close the remote write client is fatal.
func (m *manager) close() {
	if rw := m.rw; rw != nil {
		if err := rw.Close(); err != nil {
			logger.Fatalf("cannot stop the remotewrite: %s", err)
		}
	}
	m.wg.Wait()
}
// startGroup optionally restores the state of group from the remote read
// querier, runs the group in its own goroutine and registers it in m.groups
// under its ID. A restore failure is only logged.
func (m *manager) startGroup(ctx context.Context, group *Group, restore bool) {
	if restore && m.rr != nil {
		if err := group.Restore(ctx, m.rr, *remoteReadLookBack, m.labels); err != nil {
			logger.Errorf("error while restoring state for group %q: %s", group.Name, err)
		}
	}

	m.wg.Add(1)
	// the ID is taken before the group goroutine is launched
	groupID := group.ID()
	go func() {
		defer m.wg.Done()
		group.start(ctx, m.querier, m.notifiers, m.rw)
	}()
	m.groups[groupID] = group
}
// update re-reads the rule files matching path and reconciles the running
// groups with the parsed configuration:
//   - groups missing from the new configuration are closed and removed;
//   - groups present in both whose Checksum differs receive the new
//     definition through their updateCh;
//   - groups that are new are started, restoring state when restore is set.
//
// A parse failure is returned as an error and leaves running groups untouched.
func (m *manager) update(ctx context.Context, path []string, validateTpl, validateExpr, restore bool) error {
logger.Infof("reading rules configuration file from %q", strings.Join(path, ";"))
groupsCfg, err := config.Parse(path, validateTpl, validateExpr)
if err != nil {
return fmt.Errorf("cannot parse configuration file: %w", err)
}
// index the freshly parsed groups by ID
groupsRegistry := make(map[uint64]*Group)
for _, cfg := range groupsCfg {
ng := newGroup(cfg, *evaluationInterval, m.labels)
groupsRegistry[ng.ID()] = ng
}
type updateItem struct {
old *Group
new *Group
}
var toUpdate []updateItem
m.groupsMu.Lock()
for _, og := range m.groups {
ng, ok := groupsRegistry[og.ID()]
if !ok {
// old group is not present in new list,
// so must be stopped and deleted
og.close()
delete(m.groups, og.ID())
og = nil
continue
}
// the group already runs: drop it from the registry so that only
// brand-new groups remain there
delete(groupsRegistry, ng.ID())
if og.Checksum != ng.Checksum {
toUpdate = append(toUpdate, updateItem{old: og, new: ng})
}
}
// whatever is left in the registry has no running counterpart yet
for _, ng := range groupsRegistry {
m.startGroup(ctx, ng, restore)
}
m.groupsMu.Unlock()
// Updated definitions are delivered after the lock has been released;
// the sends run concurrently and update waits for all of them.
if len(toUpdate) > 0 {
var wg sync.WaitGroup
for _, item := range toUpdate {
wg.Add(1)
go func(old *Group, new *Group) {
old.updateCh <- new
wg.Done()
}(item.old, item.new)
}
wg.Wait()
}
return nil
}
// toAPI converts the group into its API representation while holding the
// group read lock. Alerting and recording rules are collected into separate
// lists; rules of any other type are left out.
func (g *Group) toAPI() APIGroup {
	g.mu.RLock()
	defer g.mu.RUnlock()

	var apiGroup APIGroup
	// encode as string to avoid rounding
	apiGroup.ID = fmt.Sprintf("%d", g.ID())
	apiGroup.Name = g.Name
	apiGroup.File = g.File
	apiGroup.Interval = g.Interval.String()
	apiGroup.Concurrency = g.Concurrency

	for _, rule := range g.Rules {
		if alerting, ok := rule.(*AlertingRule); ok {
			apiGroup.AlertingRules = append(apiGroup.AlertingRules, alerting.RuleAPI())
		} else if recording, ok := rule.(*RecordingRule); ok {
			apiGroup.RecordingRules = append(apiGroup.RecordingRules, recording.RuleAPI())
		}
	}
	return apiGroup
}

230
app/vmalert/manager_test.go Normal file
View File

@@ -0,0 +1,230 @@
package main
import (
"context"
"math/rand"
"net/url"
"os"
"sync"
"testing"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
)
// TestMain initializes the notifier template functions with a fixed
// external URL before running the tests of the package.
func TestMain(m *testing.M) {
	externalURL, _ := url.Parse("https://victoriametrics.com/path")
	notifier.InitTemplateFunc(externalURL)
	exitCode := m.Run()
	os.Exit(exitCode)
}
// TestManagerEmptyRulesDir checks that the manager update succeeds
// when the rules path matches no files.
func TestManagerEmptyRulesDir(t *testing.T) {
	mgr := &manager{groups: make(map[uint64]*Group)}
	if err := mgr.update(context.Background(), []string{"foo/bar"}, true, true, false); err != nil {
		t.Fatalf("expected to load succesfully with empty rules dir; got err instead: %v", err)
	}
}
// TestManagerUpdateConcurrent supposed to test concurrent
// execution of configuration update.
// Should be executed with -race flag
func TestManagerUpdateConcurrent(t *testing.T) {
	m := &manager{
		groups:    make(map[uint64]*Group),
		querier:   &fakeQuerier{},
		notifiers: []notifier.Notifier{&fakeNotifier{}},
	}
	paths := []string{
		"config/testdata/dir/rules0-good.rules",
		"config/testdata/dir/rules0-bad.rules",
		"config/testdata/dir/rules1-good.rules",
		"config/testdata/dir/rules1-bad.rules",
		"config/testdata/rules0-good.rules",
		"config/testdata/rules1-good.rules",
		"config/testdata/rules2-good.rules",
	}

	// the evaluation interval is a package-level flag shared with other
	// tests, so it is put back once this test is over
	prevInterval := *evaluationInterval
	*evaluationInterval = time.Millisecond
	defer func() {
		*evaluationInterval = prevInterval
	}()

	// Stop the started groups when the test ends instead of leaving their
	// goroutines running. The context is cancelled before waiting, the same
	// way TestManagerUpdate shuts its manager down. Deferred calls run in
	// reverse order, so this runs before the interval is restored.
	ctx, cancel := context.WithCancel(context.Background())
	defer func() {
		cancel()
		m.close()
	}()

	if err := m.start(ctx, []string{paths[0]}, true, true); err != nil {
		t.Fatalf("failed to start: %s", err)
	}

	const workers = 500
	const iterations = 10

	wg := sync.WaitGroup{}
	wg.Add(workers)
	for w := 0; w < workers; w++ {
		go func() {
			defer wg.Done()
			for i := 0; i < iterations; i++ {
				rnd := rand.Intn(len(paths))
				path := []string{paths[rnd]}
				// errors are expected here: some of the files are bad on purpose
				_ = m.update(ctx, path, true, true, false)
			}
		}()
	}
	wg.Wait()
}
// TestManagerUpdate tests sequential configuration
// updates.
//
// Every case loads initPath, then applies updatePath and checks that the
// manager ends up with exactly the groups listed in want.
func TestManagerUpdate(t *testing.T) {
	const defaultEvalInterval = time.Second * 30
	currentEvalInterval := *evaluationInterval
	*evaluationInterval = defaultEvalInterval
	defer func() {
		*evaluationInterval = currentEvalInterval
	}()

	var (
		VMRows = &AlertingRule{
			Name: "VMRows",
			Expr: "vm_rows > 0",
			For:  10 * time.Second,
			Labels: map[string]string{
				"label": "bar",
				"host":  "{{ $labels.instance }}",
			},
			Annotations: map[string]string{
				"summary":     "{{ $value|humanize }}",
				"description": "{{$labels}}",
			},
		}
		Conns = &AlertingRule{
			Name: "Conns",
			Expr: "sum(vm_tcplistener_conns) by(instance) > 1",
			Annotations: map[string]string{
				"summary":     "Too high connection number for {{$labels.instance}}",
				"description": "It is {{ $value }} connections for {{$labels.instance}}",
			},
		}
		ExampleAlertAlwaysFiring = &AlertingRule{
			Name: "ExampleAlertAlwaysFiring",
			Expr: "sum by(job) (up == 1)",
		}
	)

	// rules0Groups returns the two groups expected after loading
	// config/testdata/rules0-good.rules; several cases share this expectation.
	rules0Groups := func() []*Group {
		return []*Group{
			{
				File:     "config/testdata/rules0-good.rules",
				Name:     "groupGorSingleAlert",
				Interval: defaultEvalInterval,
				Rules:    []Rule{VMRows},
			},
			{
				File:     "config/testdata/rules0-good.rules",
				Name:     "TestGroup",
				Interval: defaultEvalInterval,
				Rules: []Rule{
					Conns,
					ExampleAlertAlwaysFiring,
				},
			},
		}
	}

	testCases := []struct {
		name       string
		initPath   string
		updatePath string
		want       []*Group
	}{
		{
			name:       "update good rules",
			initPath:   "config/testdata/rules0-good.rules",
			updatePath: "config/testdata/dir/rules1-good.rules",
			want: []*Group{
				{
					File:     "config/testdata/dir/rules1-good.rules",
					Name:     "duplicatedGroupDiffFiles",
					Interval: defaultEvalInterval,
					Rules: []Rule{
						&AlertingRule{
							Name:   "VMRows",
							Expr:   "vm_rows > 0",
							For:    5 * time.Minute,
							Labels: map[string]string{"label": "bar"},
							Annotations: map[string]string{
								"summary":     "{{ $value }}",
								"description": "{{$labels}}",
							},
						},
					},
				},
			},
		},
		{
			name:       "update good rules from 1 to 2 groups",
			initPath:   "config/testdata/dir/rules1-good.rules",
			updatePath: "config/testdata/rules0-good.rules",
			want:       rules0Groups(),
		},
		{
			name:       "update with one bad rule file",
			initPath:   "config/testdata/rules0-good.rules",
			updatePath: "config/testdata/dir/rules2-bad.rules",
			want:       rules0Groups(),
		},
		{
			name:       "update empty dir rules from 0 to 2 groups",
			initPath:   "config/testdata/empty/*",
			updatePath: "config/testdata/rules0-good.rules",
			want:       rules0Groups(),
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			ctx, cancel := context.WithCancel(context.TODO())
			m := &manager{groups: make(map[uint64]*Group), querier: &fakeQuerier{}}
			// Shut the manager down even when the subtest bails out via
			// t.Fatalf. The context must be cancelled before waiting,
			// hence a single deferred function with a fixed order.
			defer func() {
				cancel()
				m.close()
			}()

			path := []string{tc.initPath}
			if err := m.update(ctx, path, true, true, false); err != nil {
				t.Fatalf("failed to complete initial rules update: %s", err)
			}

			path = []string{tc.updatePath}
			// the error is ignored on purpose: a failed update must leave
			// the previously loaded groups in place, which is checked below
			_ = m.update(ctx, path, true, true, false)
			if len(tc.want) != len(m.groups) {
				t.Fatalf("\nwant number of groups: %d;\ngot: %d ", len(tc.want), len(m.groups))
			}

			for _, wantG := range tc.want {
				gotG, ok := m.groups[wantG.ID()]
				if !ok {
					t.Fatalf("expected to have group %q", wantG.Name)
				}
				compareGroups(t, wantG, gotG)
			}
		})
	}
}

39
app/vmalert/metrics.go Normal file
View File

@@ -0,0 +1,39 @@
package main
import "github.com/VictoriaMetrics/metrics"
// gauge couples a metrics.Gauge with the name it was registered under.
type gauge struct {
// name is the metric name passed to getOrCreateGauge.
name string
*metrics.Gauge
}
// getOrCreateGauge wraps metrics.GetOrCreateGauge and remembers
// the name of the gauge alongside it.
func getOrCreateGauge(name string, f func() float64) *gauge {
	g := &gauge{name: name}
	g.Gauge = metrics.GetOrCreateGauge(name, f)
	return g
}
// counter couples a metrics.Counter with the name it was registered under.
type counter struct {
// name is the metric name passed to getOrCreateCounter.
name string
*metrics.Counter
}
// getOrCreateCounter wraps metrics.GetOrCreateCounter and remembers
// the name of the counter alongside it.
func getOrCreateCounter(name string) *counter {
	c := &counter{name: name}
	c.Counter = metrics.GetOrCreateCounter(name)
	return c
}
// summary couples a metrics.Summary with the name it was registered under.
type summary struct {
// name is the metric name passed to getOrCreateSummary.
name string
*metrics.Summary
}
// getOrCreateSummary wraps metrics.GetOrCreateSummary and remembers
// the name of the summary alongside it.
func getOrCreateSummary(name string) *summary {
	s := &summary{name: name}
	s.Summary = metrics.GetOrCreateSummary(name)
	return s
}

View File

@@ -0,0 +1,112 @@
package notifier
import (
"bytes"
"fmt"
"io"
"strings"
"text/template"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
)
// Alert the triggered alert
// TODO: Looks like alert name isn't unique
type Alert struct {
// GroupID is the ID of the group the alert belongs to.
GroupID uint64
Name string
// Labels are exposed to templates as $labels.
Labels map[string]string
Annotations map[string]string
State AlertState
// Expr is exposed to templates as $expr.
Expr string
Start time.Time
// End is the zero time unless set.
End time.Time
// Value is exposed to templates as $value.
Value float64
ID uint64
}
// AlertState type indicates the Alert state
type AlertState int

// Known alert states, in order of escalation.
const (
	// StateInactive is the state of an alert that is neither firing nor pending.
	StateInactive AlertState = iota
	// StatePending is the state of an alert that has been active for less than
	// the configured threshold duration.
	StatePending
	// StateFiring is the state of an alert that has been active for longer than
	// the configured threshold duration.
	StateFiring
)

// String returns the lower-case name of the state.
// Any value other than StatePending and StateFiring is reported as "inactive".
func (as AlertState) String() string {
	if as == StatePending {
		return "pending"
	}
	if as == StateFiring {
		return "firing"
	}
	return "inactive"
}
// alertTplData is the data passed to annotation templates; tplHeader binds
// its fields to the $labels, $value and $expr template variables.
type alertTplData struct {
Labels map[string]string
Value float64
Expr string
}
// tplHeader is prepended to every annotation template; it declares the
// $value, $labels and $expr variables from the fields of alertTplData.
const tplHeader = `{{ $value := .Value }}{{ $labels := .Labels }}{{ $expr := .Expr }}`
// ExecTemplate executes the Alert template for the given
// map of annotations.
// Every alert could have a different datasource, so function
// requires a queryFunction as an argument.
func (a *Alert) ExecTemplate(q QueryFn, annotations map[string]string) (map[string]string, error) {
	data := alertTplData{
		Labels: a.Labels,
		Value:  a.Value,
		Expr:   a.Expr,
	}
	funcs := funcsWithQuery(q)
	return templateAnnotations(annotations, data, funcs)
}
// ValidateTemplates validate annotations for possible template error, uses empty data for template population
func ValidateTemplates(annotations map[string]string) error {
_, err := templateAnnotations(annotations, alertTplData{
Labels: map[string]string{},
Value: 0,
}, tmplFunc)
return err
}
// templateAnnotations renders every annotation as a template over data with
// the given template functions and returns the rendered values by key.
//
// An annotation that fails to render keeps its original text in the result;
// all such failures are collected and returned as a single error.
func templateAnnotations(annotations map[string]string, data alertTplData, funcs template.FuncMap) (map[string]string, error) {
	var (
		source   strings.Builder
		rendered bytes.Buffer
	)
	errs := new(utils.ErrGroup)
	result := make(map[string]string, len(annotations))
	for name, tpl := range annotations {
		rendered.Reset()
		// every template is prefixed with the header declaring
		// the variables available to annotations
		source.Reset()
		source.Grow(len(tplHeader) + len(tpl))
		source.WriteString(tplHeader)
		source.WriteString(tpl)

		err := templateAnnotation(&rendered, source.String(), data, funcs)
		if err == nil {
			result[name] = rendered.String()
			continue
		}
		// fall back to the raw text and remember the failure
		result[name] = tpl
		errs.Add(fmt.Errorf("key %q, template %q: %w", name, tpl, err))
	}
	return result, errs.Err()
}
// templateAnnotation parses text as a template with the given functions and
// the "missingkey=zero" option, then executes it over data, writing the
// output to dst. Parse and execution failures are returned as wrapped errors.
func templateAnnotation(dst io.Writer, text string, data alertTplData, funcs template.FuncMap) error {
	parsed, err := template.New("").Funcs(funcs).Option("missingkey=zero").Parse(text)
	if err != nil {
		return fmt.Errorf("error parsing annotation: %w", err)
	}
	if execErr := parsed.Execute(dst, data); execErr != nil {
		return fmt.Errorf("error evaluating annotation template: %w", execErr)
	}
	return nil
}

View File

@@ -0,0 +1,114 @@
package notifier
import (
"testing"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
)
// TestAlert_ExecTemplate renders annotation templates for a set of alerts
// and compares the result with the expected text, using a stubbed query
// function that always returns the same two metrics.
func TestAlert_ExecTemplate(t *testing.T) {
	type testCase struct {
		name        string
		alert       *Alert
		annotations map[string]string
		expTpl      map[string]string
	}

	// queryStub ignores the query and always answers with two fixed metrics.
	queryStub := func(q string) ([]datasource.Metric, error) {
		first := datasource.Metric{
			Labels: []datasource.Label{
				{Name: "foo", Value: "bar"},
				{Name: "baz", Value: "qux"},
			},
			Value: 1,
		}
		second := datasource.Metric{
			Labels: []datasource.Label{
				{Name: "foo", Value: "garply"},
				{Name: "baz", Value: "fred"},
			},
			Value: 2,
		}
		return []datasource.Metric{first, second}, nil
	}

	cases := []testCase{
		{
			name:        "empty-alert",
			alert:       &Alert{},
			annotations: map[string]string{},
			expTpl:      map[string]string{},
		},
		{
			name: "no-template",
			alert: &Alert{
				Value: 1e4,
				Labels: map[string]string{
					"instance": "localhost",
				},
			},
			annotations: map[string]string{},
			expTpl:      map[string]string{},
		},
		{
			name: "label-template",
			alert: &Alert{
				Value: 1e4,
				Labels: map[string]string{
					"job":      "staging",
					"instance": "localhost",
				},
			},
			annotations: map[string]string{
				"summary":     "Too high connection number for {{$labels.instance}} for job {{$labels.job}}",
				"description": "It is {{ $value }} connections for {{$labels.instance}}",
			},
			expTpl: map[string]string{
				"summary":     "Too high connection number for localhost for job staging",
				"description": "It is 10000 connections for localhost",
			},
		},
		{
			name: "expression-template",
			alert: &Alert{
				Expr: `vm_rows{"label"="bar"}>0`,
			},
			annotations: map[string]string{
				"exprEscapedQuery": "{{ $expr|quotesEscape|queryEscape }}",
				"exprEscapedPath":  "{{ $expr|quotesEscape|pathEscape }}",
			},
			expTpl: map[string]string{
				"exprEscapedQuery": "vm_rows%7B%5C%22label%5C%22%3D%5C%22bar%5C%22%7D%3E0",
				"exprEscapedPath":  "vm_rows%7B%5C%22label%5C%22=%5C%22bar%5C%22%7D%3E0",
			},
		},
		{
			name:  "query",
			alert: &Alert{Expr: `vm_rows{"label"="bar"}>0`},
			annotations: map[string]string{
				"summary": `{{ query "foo" | first | value }}`,
				"desc":    `{{ range query "bar" }}{{ . | label "foo" }} {{ . | value }};{{ end }}`,
			},
			expTpl: map[string]string{
				"summary": "1",
				"desc":    "bar 1;garply 2;",
			},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			rendered, err := tc.alert.ExecTemplate(queryStub, tc.annotations)
			if err != nil {
				t.Fatal(err)
			}
			if len(rendered) != len(tc.expTpl) {
				t.Fatalf("expected %d elements; got %d", len(tc.expTpl), len(rendered))
			}
			for k, exp := range tc.expTpl {
				if got := rendered[k]; got != exp {
					t.Fatalf("expected %q=%q; got %q=%q", k, exp, k, got)
				}
			}
		})
	}
}

View File

@@ -0,0 +1,68 @@
package notifier
import (
"bytes"
"context"
"fmt"
"io/ioutil"
"net/http"
"strings"
)
// AlertManager represents integration provider with Prometheus alert manager
// https://github.com/prometheus/alertmanager
type AlertManager struct {
// alertURL is the full URL alerts are POSTed to.
alertURL string
// basicAuthUser and basicAuthPass are sent as basic auth credentials
// when basicAuthPass is not empty.
basicAuthUser string
basicAuthPass string
// argFunc generates the generatorURL of every alert in the request body.
argFunc AlertURLGenerator
// client performs the HTTP requests.
client *http.Client
}
// Send an alert or resolve message.
//
// The alerts are serialized into a single JSON request body and POSTed to
// the Alertmanager URL; basic auth credentials are attached when a password
// is configured. ctx bounds the request.
//
// An error is returned when the request cannot be built or sent, or when
// the response status code is anything but 200 OK; in the latter case the
// response body is included in the error.
func (am *AlertManager) Send(ctx context.Context, alerts []Alert) error {
	b := &bytes.Buffer{}
	writeamRequest(b, alerts, am.argFunc)

	// bind the context at construction time: a nil ctx is then reported
	// as an error instead of panicking
	req, err := http.NewRequestWithContext(ctx, http.MethodPost, am.alertURL, b)
	if err != nil {
		return err
	}
	req.Header.Set("Content-Type", "application/json; charset=utf-8")
	if am.basicAuthPass != "" {
		req.SetBasicAuth(am.basicAuthUser, am.basicAuthPass)
	}

	resp, err := am.client.Do(req)
	if err != nil {
		return err
	}
	defer func() { _ = resp.Body.Close() }()

	if resp.StatusCode != http.StatusOK {
		body, err := ioutil.ReadAll(resp.Body)
		if err != nil {
			return fmt.Errorf("failed to read response from %q: %w", am.alertURL, err)
		}
		return fmt.Errorf("invalid SC %d from %q; response body: %s", resp.StatusCode, am.alertURL, string(body))
	}
	return nil
}
// AlertURLGenerator returns the URL pointing to the given alert.
// Its result is sent to Alertmanager as the alert's generatorURL.
type AlertURLGenerator func(Alert) string
// alertManagerPath is the API path appended to the configured
// Alertmanager base URL by NewAlertManager.
const alertManagerPath = "/api/v2/alerts"
// NewAlertManager builds an AlertManager which posts alerts to
// alertManagerURL + alertManagerPath (a single trailing slash of
// alertManagerURL is dropped first).
//
// user and pass are optional basic auth credentials, fn generates
// the per-alert generatorURL and c is the HTTP client used for requests.
func NewAlertManager(alertManagerURL, user, pass string, fn AlertURLGenerator, c *http.Client) *AlertManager {
	am := &AlertManager{
		basicAuthUser: user,
		basicAuthPass: pass,
		argFunc:       fn,
		client:        c,
	}
	am.alertURL = strings.TrimSuffix(alertManagerURL, "/") + alertManagerPath
	return am
}

View File

@@ -0,0 +1,33 @@
{% import (
"time"
) %}
{% stripspace %}
amRequest renders the given alerts as the JSON array which is sent to Alertmanager.
Every alert contains startsAt, generatorURL (the result of generatorURL(alert)),
endsAt (only when alert.End is set), labels (alertname plus alert.Labels) and annotations.
{% func amRequest(alerts []Alert, generatorURL func(Alert) string) %}
[
{% for i, alert := range alerts %}
{
"startsAt":{%q= alert.Start.Format(time.RFC3339Nano) %},
"generatorURL": {%q= generatorURL(alert) %},
{% if !alert.End.IsZero() %}
"endsAt":{%q= alert.End.Format(time.RFC3339Nano) %},
{% endif %}
"labels": {
"alertname":{%q= alert.Name %}
{% for k,v := range alert.Labels %}
,{%q= k %}:{%q= v %}
{% endfor %}
},
"annotations": {
{% code c := len(alert.Annotations) %}
{% for k,v := range alert.Annotations %}
{% code c = c-1 %}
{%q= k %}:{%q= v %}{% if c > 0 %},{% endif %}
{% endfor %}
}
}
{% if i != len(alerts)-1 %},{% endif %}
{% endfor %}
]
{% endfunc %}
{% endstripspace %}

View File

@@ -0,0 +1,130 @@
// Code generated by qtc from "alertmanager_request.qtpl". DO NOT EDIT.
// See https://github.com/valyala/quicktemplate for details.
//line app/vmalert/notifier/alertmanager_request.qtpl:1
package notifier
//line app/vmalert/notifier/alertmanager_request.qtpl:1
import (
"time"
)
//line app/vmalert/notifier/alertmanager_request.qtpl:6
import (
qtio422016 "io"
qt422016 "github.com/valyala/quicktemplate"
)
//line app/vmalert/notifier/alertmanager_request.qtpl:6
var (
_ = qtio422016.Copy
_ = qt422016.AcquireByteBuffer
)
//line app/vmalert/notifier/alertmanager_request.qtpl:6
func streamamRequest(qw422016 *qt422016.Writer, alerts []Alert, generatorURL func(Alert) string) {
//line app/vmalert/notifier/alertmanager_request.qtpl:6
qw422016.N().S(`[`)
//line app/vmalert/notifier/alertmanager_request.qtpl:8
for i, alert := range alerts {
//line app/vmalert/notifier/alertmanager_request.qtpl:8
qw422016.N().S(`{"startsAt":`)
//line app/vmalert/notifier/alertmanager_request.qtpl:10
qw422016.N().Q(alert.Start.Format(time.RFC3339Nano))
//line app/vmalert/notifier/alertmanager_request.qtpl:10
qw422016.N().S(`,"generatorURL":`)
//line app/vmalert/notifier/alertmanager_request.qtpl:11
qw422016.N().Q(generatorURL(alert))
//line app/vmalert/notifier/alertmanager_request.qtpl:11
qw422016.N().S(`,`)
//line app/vmalert/notifier/alertmanager_request.qtpl:12
if !alert.End.IsZero() {
//line app/vmalert/notifier/alertmanager_request.qtpl:12
qw422016.N().S(`"endsAt":`)
//line app/vmalert/notifier/alertmanager_request.qtpl:13
qw422016.N().Q(alert.End.Format(time.RFC3339Nano))
//line app/vmalert/notifier/alertmanager_request.qtpl:13
qw422016.N().S(`,`)
//line app/vmalert/notifier/alertmanager_request.qtpl:14
}
//line app/vmalert/notifier/alertmanager_request.qtpl:14
qw422016.N().S(`"labels": {"alertname":`)
//line app/vmalert/notifier/alertmanager_request.qtpl:16
qw422016.N().Q(alert.Name)
//line app/vmalert/notifier/alertmanager_request.qtpl:17
for k, v := range alert.Labels {
//line app/vmalert/notifier/alertmanager_request.qtpl:17
qw422016.N().S(`,`)
//line app/vmalert/notifier/alertmanager_request.qtpl:18
qw422016.N().Q(k)
//line app/vmalert/notifier/alertmanager_request.qtpl:18
qw422016.N().S(`:`)
//line app/vmalert/notifier/alertmanager_request.qtpl:18
qw422016.N().Q(v)
//line app/vmalert/notifier/alertmanager_request.qtpl:19
}
//line app/vmalert/notifier/alertmanager_request.qtpl:19
qw422016.N().S(`},"annotations": {`)
//line app/vmalert/notifier/alertmanager_request.qtpl:22
c := len(alert.Annotations)
//line app/vmalert/notifier/alertmanager_request.qtpl:23
for k, v := range alert.Annotations {
//line app/vmalert/notifier/alertmanager_request.qtpl:24
c = c - 1
//line app/vmalert/notifier/alertmanager_request.qtpl:25
qw422016.N().Q(k)
//line app/vmalert/notifier/alertmanager_request.qtpl:25
qw422016.N().S(`:`)
//line app/vmalert/notifier/alertmanager_request.qtpl:25
qw422016.N().Q(v)
//line app/vmalert/notifier/alertmanager_request.qtpl:25
if c > 0 {
//line app/vmalert/notifier/alertmanager_request.qtpl:25
qw422016.N().S(`,`)
//line app/vmalert/notifier/alertmanager_request.qtpl:25
}
//line app/vmalert/notifier/alertmanager_request.qtpl:26
}
//line app/vmalert/notifier/alertmanager_request.qtpl:26
qw422016.N().S(`}}`)
//line app/vmalert/notifier/alertmanager_request.qtpl:29
if i != len(alerts)-1 {
//line app/vmalert/notifier/alertmanager_request.qtpl:29
qw422016.N().S(`,`)
//line app/vmalert/notifier/alertmanager_request.qtpl:29
}
//line app/vmalert/notifier/alertmanager_request.qtpl:30
}
//line app/vmalert/notifier/alertmanager_request.qtpl:30
qw422016.N().S(`]`)
//line app/vmalert/notifier/alertmanager_request.qtpl:32
}
//line app/vmalert/notifier/alertmanager_request.qtpl:32
func writeamRequest(qq422016 qtio422016.Writer, alerts []Alert, generatorURL func(Alert) string) {
//line app/vmalert/notifier/alertmanager_request.qtpl:32
qw422016 := qt422016.AcquireWriter(qq422016)
//line app/vmalert/notifier/alertmanager_request.qtpl:32
streamamRequest(qw422016, alerts, generatorURL)
//line app/vmalert/notifier/alertmanager_request.qtpl:32
qt422016.ReleaseWriter(qw422016)
//line app/vmalert/notifier/alertmanager_request.qtpl:32
}
//line app/vmalert/notifier/alertmanager_request.qtpl:32
func amRequest(alerts []Alert, generatorURL func(Alert) string) string {
//line app/vmalert/notifier/alertmanager_request.qtpl:32
qb422016 := qt422016.AcquireByteBuffer()
//line app/vmalert/notifier/alertmanager_request.qtpl:32
writeamRequest(qb422016, alerts, generatorURL)
//line app/vmalert/notifier/alertmanager_request.qtpl:32
qs422016 := string(qb422016.B)
//line app/vmalert/notifier/alertmanager_request.qtpl:32
qt422016.ReleaseByteBuffer(qb422016)
//line app/vmalert/notifier/alertmanager_request.qtpl:32
return qs422016
//line app/vmalert/notifier/alertmanager_request.qtpl:32
}

View File

@@ -0,0 +1,91 @@
package notifier
import (
"context"
"encoding/json"
"net/http"
"net/http/httptest"
"strconv"
"testing"
"time"
)
// TestAlertManager_Send verifies AlertManager.Send against a test HTTP server.
//
// The server handler reacts differently on every subsequent call:
//   - call 0: the connection is hijacked and closed, so Send must return a connection error;
//   - call 1: HTTP 500 is returned, so Send must return a status code error;
//   - call 2: the request body is decoded and validated, Send must succeed.
func TestAlertManager_Send(t *testing.T) {
	const baUser, baPass = "foo", "bar"
	mux := http.NewServeMux()
	mux.HandleFunc("/", func(_ http.ResponseWriter, _ *http.Request) {
		t.Errorf("should not be called")
	})
	c := -1
	mux.HandleFunc(alertManagerPath, func(w http.ResponseWriter, r *http.Request) {
		user, pass, ok := r.BasicAuth()
		if !ok {
			t.Errorf("unauthorized request")
		}
		if user != baUser || pass != baPass {
			t.Errorf("wrong creds %q:%q; expected %q:%q",
				user, pass, baUser, baPass)
		}
		c++
		if r.Method != http.MethodPost {
			t.Errorf("expected POST method got %s", r.Method)
		}
		switch c {
		case 0:
			conn, _, _ := w.(http.Hijacker).Hijack()
			_ = conn.Close()
		case 1:
			w.WriteHeader(500)
		case 2:
			var a []struct {
				Labels       map[string]string `json:"labels"`
				StartsAt     time.Time         `json:"startsAt"`
				EndAt        time.Time         `json:"endsAt"`
				Annotations  map[string]string `json:"annotations"`
				GeneratorURL string            `json:"generatorURL"`
			}
			if err := json.NewDecoder(r.Body).Decode(&a); err != nil {
				// The handler runs in a server goroutine, where t.FailNow
				// must not be called: mark the failure and bail out instead.
				t.Errorf("can not unmarshal data into alert %s", err)
				return
			}
			if len(a) != 1 {
				t.Errorf("expected 1 alert in array got %d", len(a))
				// a[0] below would panic on an empty array.
				return
			}
			if a[0].GeneratorURL != "0/0" {
				t.Errorf("expected 0/0 as generatorURL got %s", a[0].GeneratorURL)
			}
			if a[0].Labels["alertname"] != "alert0" {
				t.Errorf("expected alert0 as alert name got %s", a[0].Labels["alertname"])
			}
			if a[0].StartsAt.IsZero() {
				t.Errorf("expected non-zero start time")
			}
			if a[0].EndAt.IsZero() {
				t.Errorf("expected non-zero end time")
			}
		}
	})
	srv := httptest.NewServer(mux)
	defer srv.Close()
	am := NewAlertManager(srv.URL, baUser, baPass, func(alert Alert) string {
		return strconv.FormatUint(alert.GroupID, 10) + "/" + strconv.FormatUint(alert.ID, 10)
	}, srv.Client())
	if err := am.Send(context.Background(), []Alert{{}, {}}); err == nil {
		t.Error("expected connection error got nil")
	}
	if err := am.Send(context.Background(), []Alert{}); err == nil {
		t.Error("expected wrong http code error got nil")
	}
	if err := am.Send(context.Background(), []Alert{{
		GroupID:     0,
		Name:        "alert0",
		Start:       time.Now().UTC(),
		End:         time.Now().UTC(),
		Annotations: map[string]string{"a": "b", "c": "d", "e": "f"},
	}}); err != nil {
		t.Errorf("unexpected error %s", err)
	}
	if c != 2 {
		t.Errorf("expected 2 calls(count from zero) to server got %d", c)
	}
}

View File

@@ -0,0 +1,46 @@
package notifier
import (
"flag"
"fmt"
"net/http"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
)
// Command-line flags configuring connections to Alertmanager.
// Array flags hold one value per -notifier.url; Init reads the value
// at the index of the corresponding URL via GetOptionalArg.
var (
	addrs             = flagutil.NewArray("notifier.url", "Prometheus alertmanager URL. Required parameter. e.g. http://127.0.0.1:9093")
	basicAuthUsername = flagutil.NewArray("notifier.basicAuth.username", "Optional basic auth username for -notifier.url")
	basicAuthPassword = flagutil.NewArray("notifier.basicAuth.password", "Optional basic auth password for -notifier.url")

	tlsInsecureSkipVerify = flag.Bool("notifier.tlsInsecureSkipVerify", false, "Whether to skip tls verification when connecting to -notifier.url")
	tlsCertFile           = flagutil.NewArray("notifier.tlsCertFile", "Optional path to client-side TLS certificate file to use when connecting to -notifier.url")
	tlsKeyFile            = flagutil.NewArray("notifier.tlsKeyFile", "Optional path to client-side TLS certificate key to use when connecting to -notifier.url")
	tlsCAFile             = flagutil.NewArray("notifier.tlsCAFile", "Optional path to TLS CA file to use for verifying connections to -notifier.url. "+
		"By default system CA is used")
	tlsServerName = flagutil.NewArray("notifier.tlsServerName", "Optional TLS server name to use for connections to -notifier.url. "+
		"By default the server name from -notifier.url is used")
)
// Init builds one AlertManager notifier per configured -notifier.url.
//
// Per-URL options (TLS files, server name, basic auth) are taken from the
// corresponding array flags at the same index as the URL. gen is used for
// generating per-alert URLs. An error is returned when no URL is configured
// or when the HTTP transport cannot be created.
func Init(gen AlertURLGenerator) ([]Notifier, error) {
	if len(*addrs) == 0 {
		return nil, fmt.Errorf("at least one `-notifier.url` must be set")
	}

	var result []Notifier
	for idx, target := range *addrs {
		transport, err := utils.Transport(
			target,
			tlsCertFile.GetOptionalArg(idx),
			tlsKeyFile.GetOptionalArg(idx),
			tlsCAFile.GetOptionalArg(idx),
			tlsServerName.GetOptionalArg(idx),
			*tlsInsecureSkipVerify,
		)
		if err != nil {
			return nil, fmt.Errorf("failed to create transport: %w", err)
		}
		httpClient := &http.Client{Transport: transport}
		username := basicAuthUsername.GetOptionalArg(idx)
		password := basicAuthPassword.GetOptionalArg(idx)
		result = append(result, NewAlertManager(target, username, password, gen, httpClient))
	}
	return result, nil
}

View File

@@ -0,0 +1,8 @@
package notifier
import "context"
// Notifier is common interface for alert manager provider
type Notifier interface {
// Send delivers the given alerts to the provider.
// It returns a non-nil error when the delivery fails.
Send(ctx context.Context, alerts []Alert) error
}

View File

@@ -0,0 +1,13 @@
package notifier
import (
"net/url"
"os"
"testing"
)
// TestMain initializes template functions with a fixed external URL
// before running the tests of the package.
func TestMain(m *testing.M) {
	externalURL, _ := url.Parse("https://victoriametrics.com/path")
	InitTemplateFunc(externalURL)
	exitCode := m.Run()
	os.Exit(exitCode)
}

View File

@@ -0,0 +1,217 @@
// Copyright 2013 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package notifier
import (
"errors"
"fmt"
"math"
"net/url"
"regexp"
"strings"
"time"
htmlTpl "html/template"
textTpl "text/template"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
)
// QueryFn is used to wrap a call to datasource into simple-to-use function
// for templating functions.
// It receives the query expression and returns the resulting metrics.
type QueryFn func(query string) ([]datasource.Metric, error)
// funcsWithQuery returns a copy of tmplFunc in which the "query"
// template function is bound to the given query callback.
// The package-level tmplFunc map itself is left untouched.
func funcsWithQuery(query QueryFn) textTpl.FuncMap {
	funcs := textTpl.FuncMap{}
	for name, impl := range tmplFunc {
		funcs[name] = impl
	}
	funcs["query"] = func(q string) ([]datasource.Metric, error) {
		return query(q)
	}
	return funcs
}
// tmplFunc holds the template helper functions.
// It is populated by InitTemplateFunc and copied by funcsWithQuery.
var tmplFunc textTpl.FuncMap
// InitTemplateFunc initiates template helper functions
//
// externalURL is captured by the "pathPrefix" and "externalURL" helpers.
// Every call replaces the package-level tmplFunc map.
func InitTemplateFunc(externalURL *url.URL) {
tmplFunc = textTpl.FuncMap{
// args packs positional arguments into a map with keys arg0, arg1, ...
"args": func(args ...interface{}) map[string]interface{} {
result := make(map[string]interface{})
for i, a := range args {
result[fmt.Sprintf("arg%d", i)] = a
}
return result
},
// reReplaceAll replaces all matches of pattern in text with repl.
// The pattern is compiled on every call; MustCompile panics on an invalid pattern.
"reReplaceAll": func(pattern, repl, text string) string {
re := regexp.MustCompile(pattern)
return re.ReplaceAllString(text, repl)
},
// safeHtml marks text as html/template HTML, i.e. it is converted as is, without escaping here.
"safeHtml": func(text string) htmlTpl.HTML {
return htmlTpl.HTML(text)
},
"match": regexp.MatchString,
"title": strings.Title,
"toUpper": strings.ToUpper,
"toLower": strings.ToLower,
// humanize formats v with 4 significant digits and a decimal (power of 1000) prefix:
// k, M, G, ... for |v| >= 1 and m, u, n, ... for |v| < 1.
"humanize": func(v float64) string {
if v == 0 || math.IsNaN(v) || math.IsInf(v, 0) {
return fmt.Sprintf("%.4g", v)
}
if math.Abs(v) >= 1 {
prefix := ""
for _, p := range []string{"k", "M", "G", "T", "P", "E", "Z", "Y"} {
if math.Abs(v) < 1000 {
break
}
prefix = p
v /= 1000
}
return fmt.Sprintf("%.4g%s", v, prefix)
}
prefix := ""
for _, p := range []string{"m", "u", "n", "p", "f", "a", "z", "y"} {
if math.Abs(v) >= 1 {
break
}
prefix = p
v *= 1000
}
return fmt.Sprintf("%.4g%s", v, prefix)
},
// humanize1024 formats v with 4 significant digits and a binary (power of 1024) prefix.
// Values with |v| <= 1, NaN and Inf are printed without a prefix.
"humanize1024": func(v float64) string {
if math.Abs(v) <= 1 || math.IsNaN(v) || math.IsInf(v, 0) {
return fmt.Sprintf("%.4g", v)
}
prefix := ""
for _, p := range []string{"ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi", "Yi"} {
if math.Abs(v) < 1024 {
break
}
prefix = p
v /= 1024
}
return fmt.Sprintf("%.4g%s", v, prefix)
},
// humanizeDuration formats v, treated as a number of seconds, as
// "Nd Nh Nm Ns" for |v| >= 1 or as sub-second units (ms, us, ...) otherwise.
"humanizeDuration": func(v float64) string {
if math.IsNaN(v) || math.IsInf(v, 0) {
return fmt.Sprintf("%.4g", v)
}
if v == 0 {
return fmt.Sprintf("%.4gs", v)
}
if math.Abs(v) >= 1 {
sign := ""
if v < 0 {
sign = "-"
v = -v
}
seconds := int64(v) % 60
minutes := (int64(v) / 60) % 60
hours := (int64(v) / 60 / 60) % 24
days := int64(v) / 60 / 60 / 24
// For days to minutes, we display seconds as an integer.
if days != 0 {
return fmt.Sprintf("%s%dd %dh %dm %ds", sign, days, hours, minutes, seconds)
}
if hours != 0 {
return fmt.Sprintf("%s%dh %dm %ds", sign, hours, minutes, seconds)
}
if minutes != 0 {
return fmt.Sprintf("%s%dm %ds", sign, minutes, seconds)
}
// For seconds, we display 4 significant digits.
return fmt.Sprintf("%s%.4gs", sign, v)
}
prefix := ""
for _, p := range []string{"m", "u", "n", "p", "f", "a", "z", "y"} {
if math.Abs(v) >= 1 {
break
}
prefix = p
v *= 1000
}
return fmt.Sprintf("%.4g%ss", v, prefix)
},
// humanizePercentage formats the ratio v as a percentage with 4 significant digits.
"humanizePercentage": func(v float64) string {
return fmt.Sprintf("%.4g%%", v*100)
},
// humanizeTimestamp formats v, treated as seconds since the Unix epoch, as UTC time.
// The value is converted through TimeFromUnixNano, so it has millisecond resolution.
"humanizeTimestamp": func(v float64) string {
if math.IsNaN(v) || math.IsInf(v, 0) {
return fmt.Sprintf("%.4g", v)
}
t := TimeFromUnixNano(int64(v * 1e9)).Time().UTC()
return fmt.Sprint(t)
},
// pathPrefix returns the path part of externalURL.
"pathPrefix": func() string {
return externalURL.Path
},
// externalURL returns externalURL in string form.
"externalURL": func() string {
return externalURL.String()
},
"pathEscape": func(u string) string {
return url.PathEscape(u)
},
"queryEscape": func(q string) string {
return url.QueryEscape(q)
},
// quotesEscape prefixes every double quote in q with a backslash.
"quotesEscape": func(q string) string {
return strings.Replace(q, `"`, `\"`, -1)
},
// query function supposed to be substituted at funcsWithQuery().
// it is present here only for validation purposes, when there is no
// provided datasource.
"query": func(q string) ([]datasource.Metric, error) {
return nil, nil
},
// first returns the first metric or an error if metrics is empty.
"first": func(metrics []datasource.Metric) (datasource.Metric, error) {
if len(metrics) > 0 {
return metrics[0], nil
}
return datasource.Metric{}, errors.New("first() called on vector with no elements")
},
// label returns the result of m.Label(label). The metric is the last
// argument, so it can be passed via a template pipeline.
"label": func(label string, m datasource.Metric) string {
return m.Label(label)
},
// value returns the Value field of the metric.
"value": func(m datasource.Metric) float64 {
return m.Value
},
}
}
const (
	// minimumTick is the minimum supported time resolution. This has to be
	// at least time.Second in order for the code below to work.
	minimumTick = time.Millisecond

	// nanosPerTick is the number of nanoseconds per minimum tick.
	nanosPerTick = int64(minimumTick / time.Nanosecond)

	// second is the Time duration equivalent to one second.
	second = int64(time.Second / minimumTick)
)

// Time is the number of milliseconds since the epoch
// (1970-01-01 00:00 UTC) excluding leap seconds.
type Time int64

// TimeFromUnixNano converts the Unix time t given in nanoseconds
// into Time. Sub-tick precision is dropped by integer division.
func TimeFromUnixNano(t int64) Time {
	ticks := t / nanosPerTick
	return Time(ticks)
}

// Time converts t into the standard time.Time representation.
func (t Time) Time() time.Time {
	ticks := int64(t)
	secs := ticks / second
	nanos := (ticks % second) * nanosPerTick
	return time.Unix(secs, nanos)
}

170
app/vmalert/recording.go Normal file
View File

@@ -0,0 +1,170 @@
package main
import (
"context"
"fmt"
"hash/fnv"
"sort"
"sync"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
"github.com/VictoriaMetrics/metrics"
)
// RecordingRule is a Rule that is supposed
// to evaluate the configured Expression and
// return TimeSeries as a result.
type RecordingRule struct {
// RuleID is the rule identifier returned by ID().
RuleID uint64
// Name is used as the __name__ label of the produced time series.
Name string
// Expr is the query expression executed in Exec.
Expr string
// Labels are added to every produced time series,
// overriding labels with the same names from the query result.
Labels map[string]string
// GroupID is the identifier of the parent group.
GroupID uint64
// guard status fields
mu sync.RWMutex
// stores last moment of time Exec was called
lastExecTime time.Time
// stores last error that happened in Exec func
// resets on every successful Exec
// may be used as Health state
lastExecError error
// metrics holds the metrics registered for the rule in newRecordingRule.
metrics *recordingRuleMetrics
}
// recordingRuleMetrics groups the metrics exposed for a single RecordingRule.
type recordingRuleMetrics struct {
// errors reports 1 if the last Exec ended with an error and 0 otherwise.
errors *gauge
}
// String implements Stringer interface.
// It returns the rule name.
func (rr *RecordingRule) String() string {
return rr.Name
}
// ID returns unique Rule ID
// within the parent Group.
// The value is the RuleID field.
func (rr *RecordingRule) ID() uint64 {
return rr.RuleID
}
// newRecordingRule creates a RecordingRule for the given group from cfg
// and registers the vmalert_recording_rules_error gauge for it.
// The gauge reports 1 while the last Exec ended with an error and 0 otherwise.
func newRecordingRule(group *Group, cfg config.Rule) *RecordingRule {
	rule := &RecordingRule{
		RuleID:  cfg.ID,
		Name:    cfg.Record,
		Expr:    cfg.Expr,
		Labels:  cfg.Labels,
		GroupID: group.ID(),
		metrics: &recordingRuleMetrics{},
	}

	metricLabels := fmt.Sprintf(`recording=%q, group=%q, id="%d"`, rule.Name, group.Name, rule.ID())
	metricName := fmt.Sprintf(`vmalert_recording_rules_error{%s}`, metricLabels)
	rule.metrics.errors = getOrCreateGauge(metricName, func() float64 {
		rule.mu.Lock()
		defer rule.mu.Unlock()
		if rule.lastExecError != nil {
			return 1
		}
		return 0
	})
	return rule
}
// Close unregisters rule metrics
// (the errors gauge registered in newRecordingRule).
func (rr *RecordingRule) Close() {
metrics.UnregisterMetric(rr.metrics.errors.name)
}
// Exec runs the rule expression via the given Querier and converts
// every returned metric into a time series.
//
// Nothing is executed when series is false. The time and the error of
// the execution are stored in the rule under rr.mu. If two resulting
// series have the same label set, errDuplicate is returned.
func (rr *RecordingRule) Exec(ctx context.Context, q datasource.Querier, series bool) ([]prompbmarshal.TimeSeries, error) {
	if !series {
		return nil, nil
	}
	qMetrics, err := q.Query(ctx, rr.Expr)

	rr.mu.Lock()
	defer rr.mu.Unlock()

	now := time.Now()
	rr.lastExecTime, rr.lastExecError = now, err
	if err != nil {
		return nil, fmt.Errorf("failed to execute query %q: %w", rr.Expr, err)
	}

	// seen tracks hashes of label sets produced so far.
	seen := make(map[uint64]struct{}, len(qMetrics))
	var result []prompbmarshal.TimeSeries
	for i := range qMetrics {
		ts := rr.toTimeSeries(qMetrics[i], now)
		key := hashTimeSeries(ts)
		if _, dup := seen[key]; dup {
			rr.lastExecError = errDuplicate
			return nil, errDuplicate
		}
		seen[key] = struct{}{}
		result = append(result, ts)
	}
	return result, nil
}
// hashTimeSeries returns FNV-1a hash of the label set of ts.
//
// Labels are sorted by name in place before hashing, so the result doesn't
// depend on the original order of labels. Both the name and the value are
// terminated with a separator byte: without the separator between them
// different label sets such as {a="bc"} and {ab="c"} would produce the same
// hash and would be treated as duplicates.
func hashTimeSeries(ts prompbmarshal.TimeSeries) uint64 {
	hash := fnv.New64a()
	labels := ts.Labels
	sort.Slice(labels, func(i, j int) bool {
		return labels[i].Name < labels[j].Name
	})
	sep := []byte("\xff")
	for _, l := range labels {
		// Writes into FNV hash never return an error.
		hash.Write([]byte(l.Name))
		hash.Write(sep)
		hash.Write([]byte(l.Value))
		hash.Write(sep)
	}
	return hash.Sum64()
}
// toTimeSeries converts the metric m into a time series with the given timestamp.
// The resulting label set is built in the following order, where later
// steps win: labels of m, __name__ set to the rule name, labels configured
// for the rule.
func (rr *RecordingRule) toTimeSeries(m datasource.Metric, timestamp time.Time) prompbmarshal.TimeSeries {
	merged := make(map[string]string)
	for i := range m.Labels {
		merged[m.Labels[i].Name] = m.Labels[i].Value
	}
	merged["__name__"] = rr.Name
	// override existing labels with configured ones
	for name, value := range rr.Labels {
		merged[name] = value
	}
	return newTimeSeries(m.Value, merged, timestamp)
}
// UpdateWith copies all significant fields (Expr and Labels)
// from the given rule, which must be a *RecordingRule.
// The execution state (last exec time and error) isn't copied:
// it is refreshed by the next Exec.
func (rr *RecordingRule) UpdateWith(r Rule) error {
	nr, ok := r.(*RecordingRule)
	if !ok {
		return fmt.Errorf("BUG: attempt to update recording rule with wrong type %#v", r)
	}
	rr.Expr = nr.Expr
	rr.Labels = nr.Labels
	return nil
}
// RuleAPI returns Rule representation in form
// of APIRecordingRule.
//
// The status fields (last exec time and error) are read under rr.mu,
// since Exec updates them under the same lock from another goroutine.
func (rr *RecordingRule) RuleAPI() APIRecordingRule {
	rr.mu.RLock()
	defer rr.mu.RUnlock()

	var lastErr string
	if rr.lastExecError != nil {
		lastErr = rr.lastExecError.Error()
	}
	return APIRecordingRule{
		// encode as strings to avoid rounding
		ID:         fmt.Sprintf("%d", rr.ID()),
		GroupID:    fmt.Sprintf("%d", rr.GroupID),
		Name:       rr.Name,
		Expression: rr.Expr,
		LastError:  lastErr,
		LastExec:   rr.lastExecTime,
		Labels:     rr.Labels,
	}
}

View File

@@ -0,0 +1,121 @@
package main
import (
"context"
"errors"
"strings"
"testing"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
)
// TestRecoridngRule_ToTimeSeries checks that Exec converts the metrics returned
// by the querier into time series named after the rule, with the labels
// configured for the rule added.
func TestRecoridngRule_ToTimeSeries(t *testing.T) {
	timestamp := time.Now()
	type testCase struct {
		rule    *RecordingRule
		metrics []datasource.Metric
		expTS   []prompbmarshal.TimeSeries
	}
	testCases := []testCase{
		{
			rule: &RecordingRule{Name: "foo"},
			metrics: []datasource.Metric{
				metricWithValueAndLabels(t, 10, "__name__", "bar"),
			},
			expTS: []prompbmarshal.TimeSeries{
				newTimeSeries(10, map[string]string{"__name__": "foo"}, timestamp),
			},
		},
		{
			rule: &RecordingRule{Name: "foobarbaz"},
			metrics: []datasource.Metric{
				metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "foo"),
				metricWithValueAndLabels(t, 2, "__name__", "bar", "job", "bar"),
				metricWithValueAndLabels(t, 3, "__name__", "baz", "job", "baz"),
			},
			expTS: []prompbmarshal.TimeSeries{
				newTimeSeries(1, map[string]string{"__name__": "foobarbaz", "job": "foo"}, timestamp),
				newTimeSeries(2, map[string]string{"__name__": "foobarbaz", "job": "bar"}, timestamp),
				newTimeSeries(3, map[string]string{"__name__": "foobarbaz", "job": "baz"}, timestamp),
			},
		},
		{
			rule: &RecordingRule{
				Name:   "job:foo",
				Labels: map[string]string{"source": "test"},
			},
			metrics: []datasource.Metric{
				metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "foo"),
				metricWithValueAndLabels(t, 1, "__name__", "bar", "job", "bar"),
			},
			expTS: []prompbmarshal.TimeSeries{
				newTimeSeries(2, map[string]string{"__name__": "job:foo", "job": "foo", "source": "test"}, timestamp),
				newTimeSeries(1, map[string]string{"__name__": "job:foo", "job": "bar", "source": "test"}, timestamp),
			},
		},
	}
	for _, tc := range testCases {
		t.Run(tc.rule.Name, func(t *testing.T) {
			querier := &fakeQuerier{}
			querier.add(tc.metrics...)
			got, err := tc.rule.Exec(context.TODO(), querier, true)
			if err != nil {
				t.Fatalf("unexpected Exec err: %s", err)
			}
			if err := compareTimeSeries(t, tc.expTS, got); err != nil {
				t.Fatalf("timeseries missmatch: %s", err)
			}
		})
	}
}
// TestRecoridngRule_ToTimeSeriesNegative checks the error paths of Exec:
// an error returned by the querier and duplicate series in the result.
func TestRecoridngRule_ToTimeSeriesNegative(t *testing.T) {
	rule := &RecordingRule{
		Name:   "job:foo",
		Labels: map[string]string{"job": "test"},
	}
	querier := &fakeQuerier{}

	// the querier error must be propagated by Exec
	expErr := "connection reset by peer"
	querier.setErr(errors.New(expErr))
	_, err := rule.Exec(context.TODO(), querier, true)
	switch {
	case err == nil:
		t.Fatalf("expected to get err; got nil")
	case !strings.Contains(err.Error(), expErr):
		t.Fatalf("expected to get err %q; got %q insterad", expErr, err)
	}

	querier.reset()
	// add metrics which differs only by `job` label
	// which will be overridden by rule
	querier.add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "foo"))
	querier.add(metricWithValueAndLabels(t, 2, "__name__", "foo", "job", "bar"))
	_, err = rule.Exec(context.TODO(), querier, true)
	switch {
	case err == nil:
		t.Fatalf("expected to get err; got nil")
	case !strings.Contains(err.Error(), errDuplicate.Error()):
		t.Fatalf("expected to get err %q; got %q insterad", errDuplicate, err)
	}
}

View File

@@ -0,0 +1,39 @@
package remoteread
import (
"flag"
"fmt"
"net/http"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
)
// Command-line flags configuring the remote read datasource.
// They are consumed by Init; remote read is disabled when -remoteRead.url is empty.
var (
addr = flag.String("remoteRead.url", "", "Optional URL to Victoria Metrics or VMSelect that will be used to restore alerts"+
" state. This configuration makes sense only if `vmalert` was configured with `remoteWrite.url` before and has been successfully persisted its state."+
" E.g. http://127.0.0.1:8428")
basicAuthUsername = flag.String("remoteRead.basicAuth.username", "", "Optional basic auth username for -remoteRead.url")
basicAuthPassword = flag.String("remoteRead.basicAuth.password", "", "Optional basic auth password for -remoteRead.url")
// TLS settings passed to utils.Transport in Init.
tlsInsecureSkipVerify = flag.Bool("remoteRead.tlsInsecureSkipVerify", false, "Whether to skip tls verification when connecting to -remoteRead.url")
tlsCertFile = flag.String("remoteRead.tlsCertFile", "", "Optional path to client-side TLS certificate file to use when connecting to -remoteRead.url")
tlsKeyFile = flag.String("remoteRead.tlsKeyFile", "", "Optional path to client-side TLS certificate key to use when connecting to -remoteRead.url")
tlsCAFile = flag.String("remoteRead.tlsCAFile", "", "Optional path to TLS CA file to use for verifying connections to -remoteRead.url. "+
"By default system CA is used")
tlsServerName = flag.String("remoteRead.tlsServerName", "", "Optional TLS server name to use for connections to -remoteRead.url. "+
"By default the server name from -remoteRead.url is used")
)
// Init builds a Querier for remote read from the command-line flags.
// It returns (nil, nil) when -remoteRead.url isn't set and an error
// when the HTTP transport cannot be created.
func Init() (datasource.Querier, error) {
	target := *addr
	if target == "" {
		return nil, nil
	}
	transport, err := utils.Transport(target, *tlsCertFile, *tlsKeyFile, *tlsCAFile, *tlsServerName, *tlsInsecureSkipVerify)
	if err != nil {
		return nil, fmt.Errorf("failed to create transport: %w", err)
	}
	httpClient := &http.Client{Transport: transport}
	querier := datasource.NewVMStorage(target, *basicAuthUsername, *basicAuthPassword, 0, httpClient)
	return querier, nil
}

View File

@@ -0,0 +1,54 @@
package remotewrite
import (
"context"
"flag"
"fmt"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
)
// Command-line flags configuring the remote write client.
// They are consumed by Init; remote write is disabled when -remoteWrite.url is empty.
var (
	addr = flag.String("remoteWrite.url", "", "Optional URL to Victoria Metrics or VMInsert where to persist alerts state"+
		" and recording rules results in form of timeseries. E.g. http://127.0.0.1:8428")
	basicAuthUsername = flag.String("remoteWrite.basicAuth.username", "", "Optional basic auth username for -remoteWrite.url")
	basicAuthPassword = flag.String("remoteWrite.basicAuth.password", "", "Optional basic auth password for -remoteWrite.url")

	maxQueueSize  = flag.Int("remoteWrite.maxQueueSize", 1e5, "Defines the max number of pending datapoints to remote write endpoint")
	maxBatchSize  = flag.Int("remoteWrite.maxBatchSize", 1e3, "Defines max number of timeseries to be flushed at once")
	concurrency   = flag.Int("remoteWrite.concurrency", 1, "Defines number of writers for concurrent writing into remote querier")
	flushInterval = flag.Duration("remoteWrite.flushInterval", 5*time.Second, "Defines interval of flushes to remote write endpoint")

	tlsInsecureSkipVerify = flag.Bool("remoteWrite.tlsInsecureSkipVerify", false, "Whether to skip tls verification when connecting to -remoteWrite.url")
	tlsCertFile           = flag.String("remoteWrite.tlsCertFile", "", "Optional path to client-side TLS certificate file to use when connecting to -remoteWrite.url")
	tlsKeyFile            = flag.String("remoteWrite.tlsKeyFile", "", "Optional path to client-side TLS certificate key to use when connecting to -remoteWrite.url")
	tlsCAFile             = flag.String("remoteWrite.tlsCAFile", "", "Optional path to TLS CA file to use for verifying connections to -remoteWrite.url. "+
		"By default system CA is used")
	tlsServerName = flag.String("remoteWrite.tlsServerName", "", "Optional TLS server name to use for connections to -remoteWrite.url. "+
		"By default the server name from -remoteWrite.url is used")
)
// Init builds the remote write Client from the command-line flags.
// It returns (nil, nil) when -remoteWrite.url isn't set and an error
// when the HTTP transport or the client cannot be created.
func Init(ctx context.Context) (*Client, error) {
	target := *addr
	if target == "" {
		return nil, nil
	}

	transport, err := utils.Transport(target, *tlsCertFile, *tlsKeyFile, *tlsCAFile, *tlsServerName, *tlsInsecureSkipVerify)
	if err != nil {
		return nil, fmt.Errorf("failed to create transport: %w", err)
	}

	cfg := Config{
		Addr:          target,
		BasicAuthUser: *basicAuthUsername,
		BasicAuthPass: *basicAuthPassword,
		Concurrency:   *concurrency,
		MaxBatchSize:  *maxBatchSize,
		MaxQueueSize:  *maxQueueSize,
		FlushInterval: *flushInterval,
		Transport:     transport,
	}
	return NewClient(ctx, cfg)
}

View File

@@ -0,0 +1,246 @@
package remotewrite
import (
"bytes"
"context"
"fmt"
"io/ioutil"
"net/http"
"strings"
"sync"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
"github.com/VictoriaMetrics/metrics"
"github.com/golang/snappy"
)
// Client is an asynchronous HTTP client for writing
// timeseries via remote write protocol.
//
// Time series are queued by Push and are sent by background
// goroutines started in NewClient.
type Client struct {
// addr is the full URL of the remote write endpoint (configured address + writePath).
addr string
// c is the HTTP client used for sending requests.
c *http.Client
// input is the queue of pending time series; its capacity is maxQueueSize.
input chan prompbmarshal.TimeSeries
// baUser and baPass are taken from Config.BasicAuthUser and Config.BasicAuthPass.
baUser, baPass string
// flushInterval is the period of the ticker which triggers flushes.
flushInterval time.Duration
// maxBatchSize is the number of accumulated series which triggers a flush.
maxBatchSize int
// maxQueueSize is reported in the error returned by Push when the queue is full.
maxQueueSize int
// wg tracks background goroutines; Close waits on it.
wg sync.WaitGroup
// doneCh is closed by Close in order to stop background goroutines.
doneCh chan struct{}
}
// Config is config for remote write.
// NewClient substitutes defaults for zero values of the optional fields.
type Config struct {
// Addr of remote storage
// NewClient returns an error if it is empty.
Addr string
// BasicAuthUser and BasicAuthPass are stored in the Client as basic auth credentials.
BasicAuthUser string
BasicAuthPass string
// Concurrency defines number of readers that
// concurrently read from the queue and flush data
// defaultConcurrency is used if the value isn't positive.
Concurrency int
// MaxBatchSize defines max number of timeseries
// to be flushed at once
// defaultMaxBatchSize is used if the value is zero.
MaxBatchSize int
// MaxQueueSize defines max length of input queue
// populated by Push method.
// Push will be rejected once queue is full.
// defaultMaxQueueSize is used if the value is zero.
MaxQueueSize int
// FlushInterval defines time interval for flushing batches
// defaultFlushInterval is used if the value is zero.
FlushInterval time.Duration
// WriteTimeout defines timeout for HTTP write request
// to remote storage
// defaultWriteTimeout is used if the value is zero.
WriteTimeout time.Duration
// Transport will be used by the underlying http.Client
// A clone of http.DefaultTransport is used if the value is nil.
Transport *http.Transport
}
// Defaults applied by NewClient to unset Config fields.
const (
defaultConcurrency = 4
defaultMaxBatchSize = 1e3
defaultMaxQueueSize = 1e5
defaultFlushInterval = 5 * time.Second
// defaultWriteTimeout also limits the final flush performed on shutdown.
defaultWriteTimeout = 30 * time.Second
)
// writePath is the API path appended to Config.Addr by NewClient.
const writePath = "/api/v1/write"
// NewClient creates an asynchronous client which writes time series
// to cfg.Addr via remote write protocol.
//
// Unset optional cfg fields are replaced with defaults. The client starts
// cfg.Concurrency (defaultConcurrency if not positive) background workers
// bound to ctx. An error is returned when cfg.Addr is empty.
func NewClient(ctx context.Context, cfg Config) (*Client, error) {
	if cfg.Addr == "" {
		return nil, fmt.Errorf("config.Addr can't be empty")
	}

	// apply defaults to unset options
	if cfg.MaxBatchSize == 0 {
		cfg.MaxBatchSize = defaultMaxBatchSize
	}
	if cfg.MaxQueueSize == 0 {
		cfg.MaxQueueSize = defaultMaxQueueSize
	}
	if cfg.FlushInterval == 0 {
		cfg.FlushInterval = defaultFlushInterval
	}
	if cfg.WriteTimeout == 0 {
		cfg.WriteTimeout = defaultWriteTimeout
	}
	if cfg.Transport == nil {
		cfg.Transport = http.DefaultTransport.(*http.Transport).Clone()
	}
	workers := cfg.Concurrency
	if workers <= 0 {
		workers = defaultConcurrency
	}

	httpClient := &http.Client{
		Timeout:   cfg.WriteTimeout,
		Transport: cfg.Transport,
	}
	client := &Client{
		c:             httpClient,
		addr:          strings.TrimSuffix(cfg.Addr, "/") + writePath,
		baUser:        cfg.BasicAuthUser,
		baPass:        cfg.BasicAuthPass,
		flushInterval: cfg.FlushInterval,
		maxBatchSize:  cfg.MaxBatchSize,
		maxQueueSize:  cfg.MaxQueueSize,
		doneCh:        make(chan struct{}),
		input:         make(chan prompbmarshal.TimeSeries, cfg.MaxQueueSize),
	}
	for n := workers; n > 0; n-- {
		client.run(ctx)
	}
	return client, nil
}
// Push adds timeseries into queue for writing into remote storage.
// Push returns an error if client is stopped or if queue is full.
//
// Push never blocks. Calling Push concurrently with Close is still
// unsafe, because Close closes the input queue.
func (c *Client) Push(s prompbmarshal.TimeSeries) error {
	// Check for shutdown before trying to enqueue: once the client is closed
	// the input channel is closed as well, and a select containing both cases
	// could pick the send on the closed channel, which panics.
	select {
	case <-c.doneCh:
		return fmt.Errorf("client is closed")
	default:
	}

	select {
	case c.input <- s:
		return nil
	default:
		return fmt.Errorf("failed to push timeseries - queue is full (%d entries). "+
			"Queue size is controlled by -remoteWrite.maxQueueSize flag",
			c.maxQueueSize)
	}
}
// Close stops the client and waits for all goroutines
// to exit.
//
// It returns an error if the client is already closed.
// Close must not be called concurrently with itself.
func (c *Client) Close() error {
	if c.doneCh == nil {
		return fmt.Errorf("client is already closed")
	}
	// doneCh is never reset to nil, so the closed state is detected
	// by probing the channel: closing it twice would panic.
	select {
	case <-c.doneCh:
		return fmt.Errorf("client is already closed")
	default:
	}
	close(c.input)
	close(c.doneCh)
	c.wg.Wait()
	return nil
}
// run starts a single background goroutine, which accumulates timeseries
// from the input queue into a batch and flushes the batch when it reaches
// maxBatchSize or when the flush ticker fires.
//
// The goroutine exits when either doneCh is closed or ctx is done. Before
// exiting it drains the input queue until the queue is closed and sends
// the remaining data with a fresh context limited by defaultWriteTimeout.
func (c *Client) run(ctx context.Context) {
	flushTicker := time.NewTicker(c.flushInterval)
	batch := &prompbmarshal.WriteRequest{}

	// drainAndFlush collects everything left in the queue and makes
	// the last flush attempt. ctx may be cancelled at this point,
	// so a separate context is used for the final request.
	drainAndFlush := func() {
		for pending := range c.input {
			batch.Timeseries = append(batch.Timeseries, pending)
		}
		finalCtx, cancelFinal := context.WithTimeout(context.Background(), defaultWriteTimeout)
		c.flush(finalCtx, batch)
		cancelFinal()
	}

	c.wg.Add(1)
	go func() {
		defer c.wg.Done()
		defer flushTicker.Stop()
		for {
			select {
			case <-c.doneCh:
				drainAndFlush()
				return
			case <-ctx.Done():
				drainAndFlush()
				return
			case <-flushTicker.C:
				c.flush(ctx, batch)
			case item, open := <-c.input:
				if open {
					batch.Timeseries = append(batch.Timeseries, item)
					if len(batch.Timeseries) >= c.maxBatchSize {
						c.flush(ctx, batch)
					}
				}
				// A closed queue yields nothing to store; the next
				// iteration picks up the shutdown signal.
			}
		}
	}()
}
// Counters updated by Client.flush.
var (
// sentRows counts timeseries from successfully delivered requests.
sentRows = metrics.NewCounter(`vmalert_remotewrite_sent_rows_total`)
// sentBytes counts snappy-compressed bytes of successfully delivered requests.
sentBytes = metrics.NewCounter(`vmalert_remotewrite_sent_bytes_total`)
// droppedRows counts timeseries dropped after all the send attempts failed.
droppedRows = metrics.NewCounter(`vmalert_remotewrite_dropped_rows_total`)
// droppedBytes counts snappy-compressed bytes dropped after all the send attempts failed.
droppedBytes = metrics.NewCounter(`vmalert_remotewrite_dropped_bytes_total`)
)
// flush is a blocking function that marshals WriteRequest and sends
// it to remote write endpoint. Flush performs limited amount of retries
// if request fails.
//
// wr is always reset before flush returns (unless it was empty), so
// the data is dropped if it couldn't be marshaled or delivered.
// Retrying stops early once ctx is done, since further attempts
// with the same context cannot succeed.
func (c *Client) flush(ctx context.Context, wr *prompbmarshal.WriteRequest) {
	if len(wr.Timeseries) < 1 {
		return
	}
	defer prompbmarshal.ResetWriteRequest(wr)

	data, err := wr.Marshal()
	if err != nil {
		logger.Errorf("failed to marshal WriteRequest: %s", err)
		return
	}

	const attempts = 5
	b := snappy.Encode(nil, data)
	made := 0
	for made < attempts {
		err := c.send(ctx, b)
		made++
		if err == nil {
			sentRows.Add(len(wr.Timeseries))
			sentBytes.Add(len(b))
			return
		}
		logger.Errorf("attempt %d to send request failed: %s", made, err)
		// Do not pause after the final attempt and do not retry
		// with a context that is already cancelled or expired.
		if made == attempts || ctx.Err() != nil {
			break
		}
		// sleeping to avoid remote db hammering
		time.Sleep(time.Second)
	}

	droppedRows.Add(len(wr.Timeseries))
	droppedBytes.Add(len(b))
	logger.Errorf("all %d attempts to send request failed - dropping %d timeseries",
		made, len(wr.Timeseries))
}
// send performs a single POST request with the given payload to the
// remote write address. Basic auth credentials are attached only when
// the configured password is non-empty. Any response status other
// than 204 (No Content) is reported as an error together with
// the response body.
func (c *Client) send(ctx context.Context, data []byte) error {
	payload := bytes.NewReader(data)
	req, err := http.NewRequest("POST", c.addr, payload)
	if err != nil {
		return fmt.Errorf("failed to create new HTTP request: %w", err)
	}
	if c.baPass != "" {
		req.SetBasicAuth(c.baUser, c.baPass)
	}
	req = req.WithContext(ctx)

	resp, err := c.c.Do(req)
	if err != nil {
		return fmt.Errorf("error while sending request to %s: %w; Data len %d(%d)",
			req.URL, err, len(data), payload.Size())
	}
	defer func() { _ = resp.Body.Close() }()

	if resp.StatusCode == http.StatusNoContent {
		return nil
	}
	// The body is read on a best-effort basis - it is used only for the error message.
	respBody, _ := ioutil.ReadAll(resp.Body)
	return fmt.Errorf("unexpected response code %d for %s. Response body %q",
		resp.StatusCode, req.URL, respBody)
}

View File

@@ -0,0 +1,102 @@
package remotewrite
import (
"context"
"fmt"
"io/ioutil"
"math/rand"
"net/http"
"net/http/httptest"
"sync/atomic"
"testing"
"time"
"github.com/golang/snappy"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompb"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
)
// TestClient_Push pushes a bunch of timeseries into the client and verifies
// that the test server has accepted every series the client agreed to enqueue
// by the time Close returns.
func TestClient_Push(t *testing.T) {
	testSrv := newRWServer()
	// Release the listener and connections owned by the test server.
	defer testSrv.Close()

	cfg := Config{
		Addr:         testSrv.URL,
		MaxBatchSize: 100,
	}
	client, err := NewClient(context.Background(), cfg)
	if err != nil {
		t.Fatalf("failed to create client: %s", err)
	}

	const rowsN = 1e4
	var sent int
	for i := 0; i < rowsN; i++ {
		s := prompbmarshal.TimeSeries{
			Samples: []prompbmarshal.Sample{{
				Value:     rand.Float64(),
				Timestamp: time.Now().Unix(),
			}},
		}
		// Push may fail when the queue is full; only accepted
		// series are expected to reach the server.
		err := client.Push(s)
		if err == nil {
			sent++
		}
	}
	if sent == 0 {
		t.Fatalf("0 series sent")
	}
	if err := client.Close(); err != nil {
		t.Fatalf("failed to close client: %s", err)
	}
	got := testSrv.accepted()
	if got != sent {
		t.Fatalf("expected to have %d series; got %d", sent, got)
	}
}
// newRWServer starts an httptest server backed by rwServer.handler
// and returns the wrapper that counts accepted rows.
func newRWServer() *rwServer {
	srv := new(rwServer)
	srv.Server = httptest.NewServer(http.HandlerFunc(srv.handler))
	return srv
}
// rwServer is a test remote write endpoint, which counts
// the number of timeseries it has accepted.
type rwServer struct {
// WARN: ordering of fields is important for alignment!
// see https://golang.org/pkg/sync/atomic/#pkg-note-BUG
// acceptedRows is accessed only via sync/atomic functions.
acceptedRows uint64
*httptest.Server
}
// accepted returns the number of timeseries accepted by the server so far.
func (rw *rwServer) accepted() int {
	total := atomic.LoadUint64(&rw.acceptedRows)
	return int(total)
}
// err responds with 400 (Bad Request) and the error text as a body.
func (rw *rwServer) err(w http.ResponseWriter, err error) {
	w.WriteHeader(http.StatusBadRequest)
	msg := []byte(err.Error())
	w.Write(msg)
}
// handler accepts POST requests with a snappy-compressed WriteRequest,
// adds the number of received timeseries to acceptedRows and responds
// with 204 (No Content). Any failure results in 400 (Bad Request).
func (rw *rwServer) handler(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodPost {
		rw.err(w, fmt.Errorf("bad method %q", r.Method))
		return
	}

	compressed, readErr := ioutil.ReadAll(r.Body)
	if readErr != nil {
		rw.err(w, fmt.Errorf("body read err: %w", readErr))
		return
	}
	defer func() { _ = r.Body.Close() }()

	raw, decodeErr := snappy.Decode(nil, compressed)
	if decodeErr != nil {
		rw.err(w, fmt.Errorf("decode err: %w", decodeErr))
		return
	}

	var req prompb.WriteRequest
	if unmarshalErr := req.Unmarshal(raw); unmarshalErr != nil {
		rw.err(w, fmt.Errorf("unmarhsal err: %w", unmarshalErr))
		return
	}

	received := uint64(len(req.Timeseries))
	atomic.AddUint64(&rw.acceptedRows, received)
	w.WriteHeader(http.StatusNoContent)
}

30
app/vmalert/rule.go Normal file
View File

@@ -0,0 +1,30 @@
package main
import (
"context"
"errors"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
)
// Rule represents alerting or recording rule
// that has unique ID, can be Executed and
// updated with other Rule.
type Rule interface {
// ID returns unique ID that may be used for
// identifying this Rule among others.
ID() uint64
// Exec executes the rule with given context
// and Querier. If returnSeries is true, Exec
// may return TimeSeries as result of execution.
Exec(ctx context.Context, q datasource.Querier, returnSeries bool) ([]prompbmarshal.TimeSeries, error)
// UpdateWith performs modification of current Rule
// with fields of the given Rule.
UpdateWith(Rule) error
// Close performs the shutdown procedures for rule
// such as metrics unregister.
Close()
}
// errDuplicate is a sentinel error for a result that contains metrics
// with the same labelset after applying rule labels.
// NOTE(review): the places returning it are outside of this file - confirm usage there.
var errDuplicate = errors.New("result contains metrics with the same labelset after applying rule labels")

28
app/vmalert/utils.go Normal file
View File

@@ -0,0 +1,28 @@
package main
import (
"sort"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
)
// newTimeSeries builds a TimeSeries with a single sample holding the given
// value at the given timestamp (converted to milliseconds). Labels are
// emitted in the ascending order of their names, so the result is
// deterministic regardless of map iteration order.
func newTimeSeries(value float64, labels map[string]string, timestamp time.Time) prompbmarshal.TimeSeries {
	// Map iteration order is random - sort label names first.
	names := make([]string, 0, len(labels))
	for name := range labels {
		names = append(names, name)
	}
	sort.Strings(names)

	var series prompbmarshal.TimeSeries
	sample := prompbmarshal.Sample{
		Value:     value,
		Timestamp: timestamp.UnixNano() / 1e6,
	}
	series.Samples = append(series.Samples, sample)
	for i := range names {
		name := names[i]
		series.Labels = append(series.Labels, prompbmarshal.Label{
			Name:  name,
			Value: labels[name],
		})
	}
	return series
}

View File

@@ -0,0 +1,43 @@
package utils
import (
"fmt"
"strings"
)
// ErrGroup accumulates multiple errors
// and produces single error message.
//
// The zero value is ready for use. A nil *ErrGroup is treated
// as an empty group by Err and Error.
type ErrGroup struct {
	errs []error
}

// Add adds a new error to group.
// Nil errors are ignored, so the result of a call may be passed to Add
// without checking it first.
// Isn't thread-safe.
func (eg *ErrGroup) Add(err error) {
	if err == nil {
		// A stored nil would make Err report a failure
		// and would make Error panic on err.Error().
		return
	}
	eg.errs = append(eg.errs, err)
}

// Err checks if group contains at least
// one error. It returns nil for an empty group and the group itself
// otherwise. A literal nil is returned on purpose: returning a nil
// *ErrGroup would produce a non-nil error interface value.
func (eg *ErrGroup) Err() error {
	if eg == nil || len(eg.errs) == 0 {
		return nil
	}
	return eg
}

// Error satisfies Error interface.
// It returns an empty string for an empty (or nil) group. Otherwise
// the message has the form "errors(N): " followed by the messages
// of all the errors separated by newlines.
func (eg *ErrGroup) Error() string {
	if eg == nil || len(eg.errs) == 0 {
		return ""
	}
	var b strings.Builder
	fmt.Fprintf(&b, "errors(%d): ", len(eg.errs))
	for i, err := range eg.errs {
		b.WriteString(err.Error())
		if i != len(eg.errs)-1 {
			b.WriteString("\n")
		}
	}
	return b.String()
}

View File

@@ -0,0 +1,38 @@
package utils
import (
"errors"
"testing"
)
// TestErrGroup verifies that an empty group reports no error, while
// a non-empty group reports an error with the expected message.
func TestErrGroup(t *testing.T) {
	type testCase struct {
		errs []error
		exp  string
	}
	cases := []testCase{
		{errs: nil, exp: ""},
		{errs: []error{errors.New("timeout")}, exp: "errors(1): timeout"},
		{
			errs: []error{errors.New("timeout"), errors.New("deadline")},
			exp:  "errors(2): timeout\ndeadline",
		},
	}
	for _, c := range cases {
		group := &ErrGroup{}
		for i := range c.errs {
			group.Add(c.errs[i])
		}
		result := group.Err()
		if len(c.errs) == 0 {
			if result != nil {
				t.Fatalf("expected to get nil error")
			}
			continue
		}
		if result == nil {
			t.Fatalf("expected to get non-nil error")
		}
		if got := group.Error(); got != c.exp {
			t.Fatalf("expected to have: \n%q\ngot:\n%q", c.exp, got)
		}
	}
}

58
app/vmalert/utils/tls.go Normal file
View File

@@ -0,0 +1,58 @@
package utils
import (
"crypto/tls"
"crypto/x509"
"fmt"
"io/ioutil"
"net/http"
"strings"
)
// Transport returns a clone of http.DefaultTransport for the given URL.
// If URL starts with `https`, the clone additionally gets TLS configuration
// built by TLSConfig from the remaining arguments; an error from TLSConfig
// is returned as is.
func Transport(URL, certFile, keyFile, CAFile, serverName string, insecureSkipVerify bool) (*http.Transport, error) {
	transport := http.DefaultTransport.(*http.Transport).Clone()
	if strings.HasPrefix(URL, "https") {
		cfg, err := TLSConfig(certFile, keyFile, CAFile, serverName, insecureSkipVerify)
		if err != nil {
			return nil, err
		}
		transport.TLSClientConfig = cfg
	}
	return transport, nil
}
// TLSConfig builds tls.Config from the provided arguments.
//
// The client certificate is loaded only if certFile is non-empty (keyFile
// is used as its key). Root CAs are loaded only if CAFile is non-empty;
// otherwise RootCAs is left nil. serverName and insecureSkipVerify are
// copied into the config as is.
func TLSConfig(certFile, keyFile, CAFile, serverName string, insecureSkipVerify bool) (*tls.Config, error) {
	cfg := &tls.Config{
		InsecureSkipVerify: insecureSkipVerify,
		ServerName:         serverName,
	}

	if certFile != "" {
		pair, err := tls.LoadX509KeyPair(certFile, keyFile)
		if err != nil {
			return nil, fmt.Errorf("cannot load TLS certificate from `cert_file`=%q, `key_file`=%q: %w", certFile, keyFile, err)
		}
		cfg.Certificates = []tls.Certificate{pair}
	}

	if CAFile != "" {
		pemData, err := ioutil.ReadFile(CAFile)
		if err != nil {
			return nil, fmt.Errorf("cannot read `ca_file` %q: %w", CAFile, err)
		}
		pool := x509.NewCertPool()
		if !pool.AppendCertsFromPEM(pemData) {
			return nil, fmt.Errorf("cannot parse data from `ca_file` %q", CAFile)
		}
		cfg.RootCAs = pool
	}

	return cfg, nil
}

View File

@@ -0,0 +1,52 @@
package utils
import "testing"
// TestTLSConfig checks that TLSConfig propagates serverName and
// insecureSkipVerify and that it fails on non-existing cert and CA files.
func TestTLSConfig(t *testing.T) {
	var certFile, keyFile, CAFile, serverName string
	var insecureSkipVerify bool
	serverName = "test"
	insecureSkipVerify = true
	tlsCfg, err := TLSConfig(certFile, keyFile, CAFile, serverName, insecureSkipVerify)
	// tlsCfg is dereferenced below, so these checks must stop the test
	// instead of letting it panic on a nil pointer.
	if err != nil {
		t.Fatalf("unexpected error %s", err)
	}
	if tlsCfg == nil {
		t.Fatalf("expected tlsConfig to be set, got nil")
	}
	if tlsCfg.ServerName != serverName {
		t.Errorf("unexpected ServerName, want %s, got %s", serverName, tlsCfg.ServerName)
	}
	if tlsCfg.InsecureSkipVerify != insecureSkipVerify {
		t.Errorf("unexpected InsecureSkipVerify, want %v, got %v", insecureSkipVerify, tlsCfg.InsecureSkipVerify)
	}

	certFile = "/path/to/nonexisting/cert/file"
	_, err = TLSConfig(certFile, keyFile, CAFile, serverName, insecureSkipVerify)
	if err == nil {
		t.Errorf("expected keypair error, got nil")
	}

	certFile = ""
	CAFile = "/path/to/nonexisting/cert/file"
	_, err = TLSConfig(certFile, keyFile, CAFile, serverName, insecureSkipVerify)
	if err == nil {
		t.Errorf("expected read error, got nil")
	}
}
// TestTransport checks that Transport succeeds for http and https URLs
// and that TLS configuration is set for the https one.
func TestTransport(t *testing.T) {
	var certFile, keyFile, CAFile, serverName string
	var insecureSkipVerify bool

	URL := "http://victoriametrics.com"
	_, err := Transport(URL, certFile, keyFile, CAFile, serverName, insecureSkipVerify)
	if err != nil {
		t.Errorf("unexpected error %s", err)
	}

	URL = "https://victoriametrics.com"
	tr, err := Transport(URL, certFile, keyFile, CAFile, serverName, insecureSkipVerify)
	if err != nil {
		// Transport returns nil on error; stop here instead of
		// panicking on the dereference below.
		t.Fatalf("unexpected error %s", err)
	}
	if tr.TLSClientConfig == nil {
		t.Errorf("expected TLSClientConfig to be set, got nil")
	}
}

185
app/vmalert/web.go Normal file
View File

@@ -0,0 +1,185 @@
package main
import (
"encoding/json"
"fmt"
"net/http"
"sort"
"strconv"
"strings"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
)
// requestHandler serves vmalert HTTP API on top of the manager state.
type requestHandler struct {
// m provides access to the loaded groups and their alerts.
m *manager
}
// pathList contains {path, description} pairs, which are rendered
// as a list of links on the index ("/") page.
var pathList = [][]string{
{"/api/v1/groups", "list all loaded groups and rules"},
{"/api/v1/alerts", "list all active alerts"},
{"/api/v1/groupID/alertID/status", "get alert status by ID"},
// /metrics is served by httpserver by default
{"/metrics", "list of application metrics"},
{"/-/reload", "reload configuration"},
}
// handler routes the request by its path and reports whether
// the request has been handled:
//
//   - "/" renders the list of links from pathList;
//   - "/api/v1/groups" and "/api/v1/alerts" return JSON listings;
//   - "/-/reload" sends SIGHUP to the current process;
//   - any other path ending with "/status" is treated as an alert status request.
//
// false is returned for all the remaining paths.
func (rh *requestHandler) handler(w http.ResponseWriter, r *http.Request) bool {
	reqPath := r.URL.Path

	// writeJSON sends either the JSON payload or the error response.
	writeJSON := func(payload []byte, err error) {
		if err != nil {
			httpserver.Errorf(w, r, "error in %q: %s", reqPath, err)
			return
		}
		w.Header().Set("Content-Type", "application/json; charset=utf-8")
		w.Write(payload)
	}

	switch reqPath {
	case "/":
		for _, entry := range pathList {
			link, description := entry[0], entry[1]
			fmt.Fprintf(w, "<a href='%s'>%q</a> - %s<br/>", link, link, description)
		}
	case "/api/v1/groups":
		writeJSON(rh.listGroups())
	case "/api/v1/alerts":
		writeJSON(rh.listAlerts())
	case "/-/reload":
		logger.Infof("api config reload was called, sending sighup")
		procutil.SelfSIGHUP()
		w.WriteHeader(http.StatusOK)
	default:
		if !strings.HasSuffix(reqPath, "/status") {
			return false
		}
		// /api/v1/<groupName>/<alertID>/status
		writeJSON(rh.alert(reqPath))
	}
	return true
}
// listGroupsResponse is the JSON body returned by listGroups.
type listGroupsResponse struct {
Data struct {
Groups []APIGroup `json:"groups"`
} `json:"data"`
Status string `json:"status"`
}
// listGroups returns JSON-encoded list of all the loaded groups
// sorted by group name. The groups are read under the manager read lock.
// An encoding failure is reported as 500 (Internal Server Error).
func (rh *requestHandler) listGroups() ([]byte, error) {
	rh.m.groupsMu.RLock()
	defer rh.m.groupsMu.RUnlock()

	lr := listGroupsResponse{Status: "success"}
	for _, g := range rh.m.groups {
		lr.Data.Groups = append(lr.Data.Groups, g.toAPI())
	}

	// sort list of groups for deterministic output
	sort.Slice(lr.Data.Groups, func(i, j int) bool {
		return lr.Data.Groups[i].Name < lr.Data.Groups[j].Name
	})

	b, err := json.Marshal(lr)
	if err != nil {
		return nil, &httpserver.ErrorWithStatusCode{
			// The message must name groups: this isn't the alerts listing.
			Err:        fmt.Errorf(`error encoding list of groups: %w`, err),
			StatusCode: http.StatusInternalServerError,
		}
	}
	return b, nil
}
// listAlertsResponse is the JSON body returned by listAlerts.
type listAlertsResponse struct {
Data struct {
Alerts []*APIAlert `json:"alerts"`
} `json:"data"`
Status string `json:"status"`
}
// listAlerts returns JSON-encoded list of alerts collected from all
// the alerting rules of all the loaded groups. Rules of other types
// are skipped. The alerts are sorted by ID. An encoding failure
// is reported as 500 (Internal Server Error).
func (rh *requestHandler) listAlerts() ([]byte, error) {
	rh.m.groupsMu.RLock()
	defer rh.m.groupsMu.RUnlock()

	var resp listAlertsResponse
	resp.Status = "success"
	for _, group := range rh.m.groups {
		for _, rule := range group.Rules {
			if alerting, isAlerting := rule.(*AlertingRule); isAlerting {
				resp.Data.Alerts = append(resp.Data.Alerts, alerting.AlertsAPI()...)
			}
		}
	}

	// Map iteration order is random - sort for deterministic output.
	alerts := resp.Data.Alerts
	sort.Slice(alerts, func(i, j int) bool {
		return alerts[i].ID < alerts[j].ID
	})

	encoded, err := json.Marshal(resp)
	if err != nil {
		return nil, &httpserver.ErrorWithStatusCode{
			Err:        fmt.Errorf(`error encoding list of active alerts: %w`, err),
			StatusCode: http.StatusInternalServerError,
		}
	}
	return encoded, nil
}
// alert returns JSON-encoded alert for the path of the form
// /api/v1/<groupID>/<alertID>/status, where both IDs are decimal
// unsigned integers.
//
// 400 (Bad Request) is returned if the path doesn't match the pattern
// or if any of the IDs cannot be parsed. 404 (Not Found) is returned
// if the manager fails to find the requested alert.
func (rh *requestHandler) alert(path string) ([]byte, error) {
	rh.m.groupsMu.RLock()
	defer rh.m.groupsMu.RUnlock()

	parts := strings.SplitN(strings.TrimPrefix(path, "/api/v1/"), "/", 3)
	// SplitN puts the whole remainder into the last part, so it must be
	// compared with "status" explicitly. Otherwise paths with extra
	// segments such as /api/v1/1/2/foo/status would be accepted.
	if len(parts) != 3 || parts[2] != "status" {
		return nil, &httpserver.ErrorWithStatusCode{
			Err:        fmt.Errorf(`path %q cointains /status suffix but doesn't match pattern "/group/alert/status"`, path),
			StatusCode: http.StatusBadRequest,
		}
	}

	groupID, err := uint64FromPath(parts[0])
	if err != nil {
		return nil, badRequest(fmt.Errorf(`cannot parse groupID: %w`, err))
	}
	alertID, err := uint64FromPath(parts[1])
	if err != nil {
		return nil, badRequest(fmt.Errorf(`cannot parse alertID: %w`, err))
	}
	resp, err := rh.m.AlertAPI(groupID, alertID)
	if err != nil {
		return nil, errResponse(err, http.StatusNotFound)
	}
	return json.Marshal(resp)
}
// uint64FromPath parses the path segment as a decimal uint64.
// Trailing slashes are ignored.
func uint64FromPath(path string) (uint64, error) {
	s := strings.TrimRight(path, "/")
	// bitSize must be 64: bitSize 0 limits the value to the platform
	// `uint`, which rejects IDs above 2^32-1 on 32-bit builds.
	return strconv.ParseUint(s, 10, 64)
}
// badRequest wraps err into a response error with 400 (Bad Request) status code.
func badRequest(err error) *httpserver.ErrorWithStatusCode {
	return &httpserver.ErrorWithStatusCode{
		Err:        err,
		StatusCode: http.StatusBadRequest,
	}
}
// errResponse wraps err into a response error with the given status code sc.
func errResponse(err error, sc int) *httpserver.ErrorWithStatusCode {
	resp := new(httpserver.ErrorWithStatusCode)
	resp.Err = err
	resp.StatusCode = sc
	return resp
}

81
app/vmalert/web_test.go Normal file
View File

@@ -0,0 +1,81 @@
package main
import (
"encoding/json"
"net/http"
"net/http/httptest"
"reflect"
"testing"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
)
// TestHandler runs requestHandler behind a test HTTP server with a single
// group holding a single alerting rule with a single alert, and checks
// the responses of the API endpoints.
func TestHandler(t *testing.T) {
	ar := &AlertingRule{
		Name: "alert",
		alerts: map[uint64]*notifier.Alert{
			0: {},
		},
	}
	g := &Group{
		Name:  "group",
		Rules: []Rule{ar},
	}
	m := &manager{groups: make(map[uint64]*Group)}
	m.groups[0] = g
	rh := &requestHandler{m: m}

	// getResp performs GET request to url, verifies the status code and
	// decodes the JSON body into `to` if it isn't nil.
	// It accepts t explicitly, so failures are reported against the calling
	// subtest and Fatalf runs on the goroutine of that subtest.
	getResp := func(t *testing.T, url string, to interface{}, code int) {
		t.Helper()
		resp, err := http.Get(url)
		if err != nil {
			// resp is nil on error - it mustn't be dereferenced below.
			t.Fatalf("unexpected err %s", err)
		}
		defer func() {
			if err := resp.Body.Close(); err != nil {
				t.Errorf("err closing body %s", err)
			}
		}()
		if code != resp.StatusCode {
			t.Errorf("unexpected status code %d want %d", resp.StatusCode, code)
		}
		if to != nil {
			if err = json.NewDecoder(resp.Body).Decode(to); err != nil {
				t.Errorf("unexpected err %s", err)
			}
		}
	}

	ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { rh.handler(w, r) }))
	defer ts.Close()

	t.Run("/api/v1/alerts", func(t *testing.T) {
		lr := listAlertsResponse{}
		getResp(t, ts.URL+"/api/v1/alerts", &lr, 200)
		if length := len(lr.Data.Alerts); length != 1 {
			t.Errorf("expected 1 alert got %d", length)
		}
	})
	t.Run("/api/v1/groups", func(t *testing.T) {
		lr := listGroupsResponse{}
		getResp(t, ts.URL+"/api/v1/groups", &lr, 200)
		if length := len(lr.Data.Groups); length != 1 {
			t.Errorf("expected 1 group got %d", length)
		}
	})
	t.Run("/api/v1/0/0/status", func(t *testing.T) {
		alert := &APIAlert{}
		getResp(t, ts.URL+"/api/v1/0/0/status", alert, 200)
		expAlert := ar.newAlertAPI(*ar.alerts[0])
		if !reflect.DeepEqual(alert, expAlert) {
			t.Errorf("expected %v is equal to %v", alert, expAlert)
		}
	})
	t.Run("/api/v1/0/1/status", func(t *testing.T) {
		getResp(t, ts.URL+"/api/v1/0/1/status", nil, 404)
	})
	t.Run("/api/v1/1/0/status", func(t *testing.T) {
		getResp(t, ts.URL+"/api/v1/1/0/status", nil, 404)
	})
	t.Run("/", func(t *testing.T) {
		getResp(t, ts.URL, nil, 200)
	})
}

54
app/vmalert/web_types.go Normal file
View File

@@ -0,0 +1,54 @@
package main
import (
"time"
)
// APIAlert represents a notifier.Alert state
// for WEB view. It is serialized to JSON by the API handlers.
type APIAlert struct {
ID string `json:"id"`
Name string `json:"name"`
GroupID string `json:"group_id"`
Expression string `json:"expression"`
State string `json:"state"`
Value string `json:"value"`
Labels map[string]string `json:"labels"`
Annotations map[string]string `json:"annotations"`
// NOTE: the JSON key is camelCase, unlike snake_case keys of the sibling fields.
ActiveAt time.Time `json:"activeAt"`
}
// APIGroup represents Group for WEB view.
// It is serialized to JSON as an item of the groups listing.
type APIGroup struct {
Name string `json:"name"`
ID string `json:"id"`
File string `json:"file"`
Interval string `json:"interval"`
Concurrency int `json:"concurrency"`
AlertingRules []APIAlertingRule `json:"alerting_rules"`
RecordingRules []APIRecordingRule `json:"recording_rules"`
}
// APIAlertingRule represents AlertingRule for WEB view.
// It is serialized to JSON as a part of APIGroup.
type APIAlertingRule struct {
ID string `json:"id"`
Name string `json:"name"`
GroupID string `json:"group_id"`
Expression string `json:"expression"`
For string `json:"for"`
LastError string `json:"last_error"`
LastExec time.Time `json:"last_exec"`
Labels map[string]string `json:"labels"`
Annotations map[string]string `json:"annotations"`
}
// APIRecordingRule represents RecordingRule for WEB view.
// It is serialized to JSON as a part of APIGroup.
type APIRecordingRule struct {
ID string `json:"id"`
Name string `json:"name"`
GroupID string `json:"group_id"`
Expression string `json:"expression"`
LastError string `json:"last_error"`
LastExec time.Time `json:"last_exec"`
Labels map[string]string `json:"labels"`
}

79
app/vmauth/Makefile Normal file
View File

@@ -0,0 +1,79 @@
# All these commands must run from repository root.
# Every target below sets APP_NAME=vmauth and delegates to a shared target,
# which is defined outside of this file.

# Local builds.
vmauth:
APP_NAME=vmauth $(MAKE) app-local
vmauth-race:
APP_NAME=vmauth RACE=-race $(MAKE) app-local
# Builds via docker, one target per architecture.
vmauth-prod:
APP_NAME=vmauth $(MAKE) app-via-docker
vmauth-pure-prod:
APP_NAME=vmauth $(MAKE) app-via-docker-pure
vmauth-amd64-prod:
APP_NAME=vmauth $(MAKE) app-via-docker-amd64
vmauth-arm-prod:
APP_NAME=vmauth $(MAKE) app-via-docker-arm
vmauth-arm64-prod:
APP_NAME=vmauth $(MAKE) app-via-docker-arm64
vmauth-ppc64le-prod:
APP_NAME=vmauth $(MAKE) app-via-docker-ppc64le
vmauth-386-prod:
APP_NAME=vmauth $(MAKE) app-via-docker-386
# Docker image packaging, one target per architecture.
package-vmauth:
APP_NAME=vmauth $(MAKE) package-via-docker
package-vmauth-pure:
APP_NAME=vmauth $(MAKE) package-via-docker-pure
package-vmauth-amd64:
APP_NAME=vmauth $(MAKE) package-via-docker-amd64
package-vmauth-arm:
APP_NAME=vmauth $(MAKE) package-via-docker-arm
package-vmauth-arm64:
APP_NAME=vmauth $(MAKE) package-via-docker-arm64
package-vmauth-ppc64le:
APP_NAME=vmauth $(MAKE) package-via-docker-ppc64le
package-vmauth-386:
APP_NAME=vmauth $(MAKE) package-via-docker-386
publish-vmauth:
APP_NAME=vmauth $(MAKE) publish-via-docker
# Runs vmauth in docker with app/vmauth mounted and the example config passed via -auth.config.
run-vmauth:
APP_NAME=vmauth \
DOCKER_OPTS='-v $(shell pwd)/app/vmauth/:/app/vmauth' \
ARGS='-auth.config=app/vmauth/example_config.yml' \
$(MAKE) run-via-docker
# Local builds for the given GOARCH. CGO is enabled only for amd64.
vmauth-amd64:
CGO_ENABLED=1 GOARCH=amd64 $(MAKE) vmauth-local-with-goarch
vmauth-arm:
CGO_ENABLED=0 GOARCH=arm $(MAKE) vmauth-local-with-goarch
vmauth-arm64:
CGO_ENABLED=0 GOARCH=arm64 $(MAKE) vmauth-local-with-goarch
vmauth-ppc64le:
CGO_ENABLED=0 GOARCH=ppc64le $(MAKE) vmauth-local-with-goarch
vmauth-386:
CGO_ENABLED=0 GOARCH=386 $(MAKE) vmauth-local-with-goarch
vmauth-local-with-goarch:
APP_NAME=vmauth $(MAKE) app-local-with-goarch
vmauth-pure:
APP_NAME=vmauth $(MAKE) app-local-pure

207
app/vmauth/README.md Normal file
View File

@@ -0,0 +1,207 @@
## vmauth
`vmauth` is a simple auth proxy and router for [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics).
It reads username and password from [Basic Auth headers](https://en.wikipedia.org/wiki/Basic_access_authentication),
matches them against configs pointed by `-auth.config` command-line flag and proxies incoming HTTP requests to the configured per-user `url_prefix` on successful match.
### Quick start
Just download `vmutils-*` archive from [releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases), unpack it
and pass the following flag to `vmauth` binary in order to start authorizing and routing requests:
```
/path/to/vmauth -auth.config=/path/to/auth/config.yml
```
After that `vmauth` starts accepting HTTP requests on port `8427` and routing them according to the provided [-auth.config](#auth-config).
The port can be modified via `-httpListenAddr` command-line flag.
The auth config can be reloaded by passing `SIGHUP` signal to `vmauth`.
Docker images for `vmauth` are available [here](https://hub.docker.com/r/victoriametrics/vmauth/tags).
Pass `-help` to `vmauth` in order to see all the supported command-line flags with their descriptions.
Feel free to [contact us](mailto:info@victoriametrics.com) if you need a customized auth proxy for VictoriaMetrics with support for LDAP, SSO, RBAC, SAML, accounting, limits, etc.
### Auth config
Auth config is represented in the following simple `yml` format:
```yml
# Arbitrary number of usernames may be put here.
# Usernames must be unique.
users:
# The user for querying local single-node VictoriaMetrics.
# All the requests to http://vmauth:8427 with the given Basic Auth (username:password)
# will be routed to http://localhost:8428 .
# For example, http://vmauth:8427/api/v1/query is routed to http://localhost:8428/api/v1/query
- username: "local-single-node"
password: "***"
url_prefix: "http://localhost:8428"
# The user for querying account 123 in VictoriaMetrics cluster
# See https://victoriametrics.github.io/Cluster-VictoriaMetrics.html#url-format
# All the requests to http://vmauth:8427 with the given Basic Auth (username:password)
# will be routed to http://vmselect:8481/select/123/prometheus .
# For example, http://vmauth:8427/api/v1/query is routed to http://vmselect:8481/select/123/prometheus/api/v1/query
- username: "cluster-select-account-123"
password: "***"
url_prefix: "http://vmselect:8481/select/123/prometheus"
# The user for inserting Prometheus data into VictoriaMetrics cluster under account 42
# See https://victoriametrics.github.io/Cluster-VictoriaMetrics.html#url-format
# All the requests to http://vmauth:8427 with the given Basic Auth (username:password)
# will be routed to http://vminsert:8480/insert/42/prometheus .
# For example, http://vmauth:8427/api/v1/write is routed to http://vminsert:8480/insert/42/prometheus/api/v1/write
- username: "cluster-insert-account-42"
password: "***"
url_prefix: "http://vminsert:8480/insert/42/prometheus"
```
The config may contain `%{ENV_VAR}` placeholders, which are substituted by the corresponding `ENV_VAR` environment variable values.
This may be useful for passing secrets to the config.
### Security
Do not transfer Basic Auth headers in plaintext over untrusted networks. Enable https. This can be done by passing the following `-tls*` command-line flags to `vmauth`:
```
-tls
Whether to enable TLS (aka HTTPS) for incoming requests. -tlsCertFile and -tlsKeyFile must be set if -tls is set
-tlsCertFile string
Path to file with TLS certificate. Used only if -tls is set. Prefer ECDSA certs instead of RSA certs, since RSA certs are slow
-tlsKeyFile string
Path to file with TLS key. Used only if -tls is set
```
Alternatively, [https termination proxy](https://en.wikipedia.org/wiki/TLS_termination_proxy) may be put in front of `vmauth`.
### Monitoring
`vmauth` exports various metrics in Prometheus exposition format at `http://vmauth-host:8427/metrics` page. It is recommended setting up regular scraping of this page
either via [vmagent](https://victoriametrics.github.io/vmagent.html) or via Prometheus, so the exported metrics could be analyzed later.
### How to build from sources
It is recommended using [binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) - `vmauth` is located in `vmutils-*` archives there.
#### Development build
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.13.
2. Run `make vmauth` from the root folder of the repository.
It builds `vmauth` binary and puts it into the `bin` folder.
#### Production build
1. [Install docker](https://docs.docker.com/install/).
2. Run `make vmauth-prod` from the root folder of the repository.
It builds `vmauth-prod` binary and puts it into the `bin` folder.
#### Building docker images
Run `make package-vmauth`. It builds `victoriametrics/vmauth:<PKG_TAG>` docker image locally.
`<PKG_TAG>` is auto-generated image tag, which depends on source code in the repository.
The `<PKG_TAG>` may be manually set via `PKG_TAG=foobar make package-vmauth`.
The base docker image is [alpine](https://hub.docker.com/_/alpine) but it is possible to use any other base image
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of [scratch](https://hub.docker.com/_/scratch) image:
```bash
ROOT_IMAGE=scratch make package-vmauth
```
### Profiling
`vmauth` provides handlers for collecting the following [Go profiles](https://blog.golang.org/profiling-go-programs):
* Memory profile. It can be collected with the following command:
```bash
curl -s http://<vmauth-host>:8427/debug/pprof/heap > mem.pprof
```
* CPU profile. It can be collected with the following command:
```bash
curl -s http://<vmauth-host>:8427/debug/pprof/profile > cpu.pprof
```
The command for collecting CPU profile waits for 30 seconds before returning.
The collected profiles may be analyzed with [go tool pprof](https://github.com/google/pprof).
### Advanced usage
Pass `-help` command-line arg to `vmauth` in order to see all the configuration options:
```
./vmauth -help
vmauth authenticates and authorizes incoming requests and proxies them to VictoriaMetrics.
See the docs at https://victoriametrics.github.io/vmauth.html .
-auth.config string
Path to auth config. See https://victoriametrics.github.io/vmauth.html for details on the format of this auth config
-enableTCP6
Whether to enable IPv6 for listening and dialing. By default only IPv4 TCP is used
-envflag.enable
Whether to enable reading flags from environment variables additionally to command line. Command line flag values have priority over values from environment vars. Flags are read only from command line if this flag isn't set
-envflag.prefix string
Prefix for environment variables if -envflag.enable is set
-http.connTimeout duration
Incoming http connections are closed after the configured timeout. This may help spreading incoming load among a cluster of services behind load balancer. Note that the real timeout may be bigger by up to 10% as a protection from Thundering herd problem (default 2m0s)
-http.disableResponseCompression
Disable compression of HTTP responses for saving CPU resources. By default compression is enabled to save network bandwidth
-http.idleConnTimeout duration
Timeout for incoming idle http connections (default 1m0s)
-http.maxGracefulShutdownDuration duration
The maximum duration for graceful shutdown of HTTP server. Highly loaded server may require increased value for graceful shutdown (default 7s)
-http.pathPrefix string
An optional prefix to add to all the paths handled by http server. For example, if '-http.pathPrefix=/foo/bar' is set, then all the http requests will be handled on '/foo/bar/*' paths. This may be useful for proxied requests. See https://www.robustperception.io/using-external-urls-and-proxies-with-prometheus
-http.shutdownDelay duration
Optional delay before http server shutdown. During this dealy the servier returns non-OK responses from /health page, so load balancers can route new requests to other servers
-httpAuth.password string
Password for HTTP Basic Auth. The authentication is disabled if -httpAuth.username is empty
-httpAuth.username string
Username for HTTP Basic Auth. The authentication is disabled if empty. See also -httpAuth.password
-httpListenAddr string
TCP address to listen for http connections (default ":8427")
-loggerErrorsPerSecondLimit int
Per-second limit on the number of ERROR messages. If more than the given number of errors are emitted per second, then the remaining errors are suppressed. Zero value disables the rate limit (default 10)
-loggerFormat string
Format for logs. Possible values: default, json (default "default")
-loggerLevel string
Minimum level of errors to log. Possible values: INFO, WARN, ERROR, FATAL, PANIC (default "INFO")
-loggerOutput string
Output for the logs. Supported values: stderr, stdout (default "stderr")
-memory.allowedBytes value
Allowed size of system memory VictoriaMetrics caches may occupy. This option overrides -memory.allowedPercent if set to non-zero value. Too low value may increase cache miss rate, which usually results in higher CPU and disk IO usage. Too high value may evict too much data from OS page cache, which will result in higher disk IO usage
Supports the following optional suffixes for values: KB, MB, GB, KiB, MiB, GiB (default 0)
-memory.allowedPercent float
Allowed percent of system memory VictoriaMetrics caches may occupy. See also -memory.allowedBytes. Too low value may increase cache miss rate, which usually results in higher CPU and disk IO usage. Too high value may evict too much data from OS page cache, which will result in higher disk IO usage (default 60)
-metricsAuthKey string
Auth key for /metrics. It overrides httpAuth settings
-pprofAuthKey string
Auth key for /debug/pprof. It overrides httpAuth settings
-tls
Whether to enable TLS (aka HTTPS) for incoming requests. -tlsCertFile and -tlsKeyFile must be set if -tls is set
-tlsCertFile string
Path to file with TLS certificate. Used only if -tls is set. Prefer ECDSA certs instead of RSA certs, since RSA certs are slow
-tlsKeyFile string
Path to file with TLS key. Used only if -tls is set
-version
Show VictoriaMetrics version
```

131
app/vmauth/auth_config.go Normal file
View File

@@ -0,0 +1,131 @@
package main
import (
"flag"
"fmt"
"io/ioutil"
"net/url"
"strings"
"sync"
"sync/atomic"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/envtemplate"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
"github.com/VictoriaMetrics/metrics"
"gopkg.in/yaml.v2"
)
var (
// authConfigPath is the path to the auth config file. It is required:
// initAuthConfig terminates the process if it is empty.
authConfigPath = flag.String("auth.config", "", "Path to auth config. See https://victoriametrics.github.io/vmauth.html "+
"for details on the format of this auth config")
)
// AuthConfig represents auth config.
// It is the top-level structure of the YAML file pointed by authConfigPath.
type AuthConfig struct {
Users []UserInfo `yaml:"users"`
}
// UserInfo is user information read from authConfigPath
type UserInfo struct {
// Username and Password are the Basic Auth credentials requestHandler compares incoming requests against.
Username string `yaml:"username"`
Password string `yaml:"password"`
// URLPrefix is the prefix the request URI is appended to in order to build the proxy target URL.
// parseAuthConfig strips trailing slashes from it and accepts only `http` and `https` schemes.
URLPrefix string `yaml:"url_prefix"`
// requests is the per-user `vmauth_user_requests_total` counter set by parseAuthConfig.
// It is unexported, so it isn't populated from YAML.
requests *metrics.Counter
}
// initAuthConfig loads the auth config from the file passed via `-auth.config`,
// publishes it in authConfig and starts the goroutine, which reloads it on SIGHUP.
//
// A missing flag or an unreadable config at startup is reported via logger.Fatalf.
// Call stopAuthConfig in order to stop the reloader goroutine.
func initAuthConfig() {
	configPath := *authConfigPath
	if configPath == "" {
		logger.Fatalf("missing required `-auth.config` command-line flag")
	}
	users, err := readAuthConfig(configPath)
	if err != nil {
		logger.Fatalf("cannot load auth config from `-auth.config=%s`: %s", configPath, err)
	}
	authConfig.Store(users)

	// stopCh must be created before the reloader starts selecting on it.
	stopCh = make(chan struct{})
	authConfigWG.Add(1)
	go func() {
		defer authConfigWG.Done()
		authConfigReloader()
	}()
}
// stopAuthConfig stops the config reloader goroutine started by initAuthConfig
// and waits until it exits.
func stopAuthConfig() {
// Closing stopCh makes authConfigReloader return.
close(stopCh)
authConfigWG.Wait()
}
// authConfigReloader re-reads the auth config on every SIGHUP until stopCh is closed.
//
// If the new config cannot be loaded, the error is logged and the previously
// stored config remains in use.
func authConfigReloader() {
	reloadCh := procutil.NewSighupChan()
	for {
		select {
		case <-stopCh:
			return
		case <-reloadCh:
		}

		logger.Infof("SIGHUP received; loading -auth.config=%q", *authConfigPath)
		users, err := readAuthConfig(*authConfigPath)
		if err == nil {
			authConfig.Store(users)
			logger.Infof("Successfully reloaded -auth.config=%q", *authConfigPath)
		} else {
			logger.Errorf("failed to load -auth.config=%q; using the last successfully loaded config; error: %s", *authConfigPath, err)
		}
	}
}
// authConfig stores the currently active map[string]*UserInfo keyed by username.
// It is written by initAuthConfig and authConfigReloader and read by requestHandler.
var authConfig atomic.Value
// authConfigWG allows stopAuthConfig to wait for the reloader goroutine to exit.
var authConfigWG sync.WaitGroup
// stopCh is closed by stopAuthConfig in order to stop authConfigReloader.
var stopCh chan struct{}
// readAuthConfig reads the file at path and parses it with parseAuthConfig.
//
// It returns users keyed by username and logs the number of loaded users on success.
func readAuthConfig(path string) (map[string]*UserInfo, error) {
	contents, readErr := ioutil.ReadFile(path)
	if readErr != nil {
		return nil, fmt.Errorf("cannot read %q: %w", path, readErr)
	}
	users, parseErr := parseAuthConfig(contents)
	if parseErr != nil {
		return nil, fmt.Errorf("cannot parse %q: %w", path, parseErr)
	}
	logger.Infof("Loaded information about %d users from %q", len(users), path)
	return users, nil
}
// parseAuthConfig parses YAML-encoded auth config from data and returns users keyed by username.
//
// The data is passed through envtemplate.Replace before strict YAML unmarshaling.
// An error is returned if the `users` section is empty, if a username is duplicated
// or if a `url_prefix` cannot be parsed or has a scheme other than `http` or `https`.
// Trailing slashes are removed from every `url_prefix`, and a per-user
// `vmauth_user_requests_total` counter is attached to every returned user.
func parseAuthConfig(data []byte) (map[string]*UserInfo, error) {
	var cfg AuthConfig
	if err := yaml.UnmarshalStrict(envtemplate.Replace(data), &cfg); err != nil {
		return nil, fmt.Errorf("cannot unmarshal AuthConfig data: %w", err)
	}
	if len(cfg.Users) == 0 {
		return nil, fmt.Errorf("`users` section cannot be empty in AuthConfig")
	}

	byUsername := make(map[string]*UserInfo, len(cfg.Users))
	for idx := range cfg.Users {
		// Take the address of the slice element, so the map points to the stored entry.
		user := &cfg.Users[idx]
		if byUsername[user.Username] != nil {
			return nil, fmt.Errorf("duplicate username found; username: %q", user.Username)
		}

		// Remove trailing '/' from url_prefix, since the request URI appended later starts with '/'.
		prefix := strings.TrimRight(user.URLPrefix, "/")

		// Validate url_prefix.
		parsed, err := url.Parse(prefix)
		if err != nil {
			return nil, fmt.Errorf("invalid `url_prefix: %q`: %w", prefix, err)
		}
		if scheme := parsed.Scheme; scheme != "http" && scheme != "https" {
			return nil, fmt.Errorf("unsupported scheme for `url_prefix: %q`: %q; must be `http` or `https`", prefix, scheme)
		}

		user.URLPrefix = prefix
		user.requests = metrics.GetOrCreateCounter(fmt.Sprintf(`vmauth_user_requests_total{username=%q}`, user.Username))
		byUsername[user.Username] = user
	}
	return byUsername, nil
}

View File

@@ -0,0 +1,112 @@
package main
import (
"reflect"
"testing"
)
// TestParseAuthConfigFailure verifies that parseAuthConfig returns an error
// for every invalid config listed below.
func TestParseAuthConfigFailure(t *testing.T) {
// f fails the test if parseAuthConfig accepts the config s.
f := func(s string) {
t.Helper()
_, err := parseAuthConfig([]byte(s))
if err == nil {
t.Fatalf("expecting non-nil error")
}
}
// Empty config
f(``)
// Invalid entry
f(`foobar`)
f(`foobar: baz`)
// Empty users
f(`users: []`)
// Missing url_prefix
f(`
users:
- username: foo
`)
// Invalid url_prefix
f(`
users:
- username: foo
url_prefix: bar
`)
f(`
users:
- username: foo
url_prefix: ftp://bar
`)
f(`
users:
- username: foo
url_prefix: //bar
`)
// Duplicate users
f(`
users:
- username: foo
url_prefix: http://foo.bar
- username: bar
url_prefix: http://xxx.yyy
- username: foo
url_prefix: https://sss.sss
`)
}
// TestParseAuthConfigSuccess verifies that parseAuthConfig returns the expected
// users for valid configs, including trailing slash removal from url_prefix.
func TestParseAuthConfigSuccess(t *testing.T) {
// f parses s and compares the result with expectedAuthConfig.
f := func(s string, expectedAuthConfig map[string]*UserInfo) {
t.Helper()
m, err := parseAuthConfig([]byte(s))
if err != nil {
t.Fatalf("unexpected error: %s", err)
}
// The requests counters are reset, since the expected values don't set them.
removeMetrics(m)
if !reflect.DeepEqual(m, expectedAuthConfig) {
t.Fatalf("unexpected auth config\ngot\n%v\nwant\n%v", m, expectedAuthConfig)
}
}
// Single user
f(`
users:
- username: foo
password: bar
url_prefix: http://aaa:343/bbb
`, map[string]*UserInfo{
"foo": {
Username: "foo",
Password: "bar",
URLPrefix: "http://aaa:343/bbb",
},
})
// Multiple users
f(`
users:
- username: foo
url_prefix: http://foo
- username: bar
url_prefix: https://bar/x///
`, map[string]*UserInfo{
"foo": {
Username: "foo",
URLPrefix: "http://foo",
},
"bar": {
Username: "bar",
URLPrefix: "https://bar/x",
},
})
}
// removeMetrics resets the requests counter for every user in m,
// so the users can be compared against expected values built without counters.
func removeMetrics(m map[string]*UserInfo) {
	for username := range m {
		m[username].requests = nil
	}
}

View File

@@ -0,0 +1,8 @@
ARG base_image
FROM $base_image
EXPOSE 8427
ENTRYPOINT ["/vmauth-prod"]
ARG src_binary
COPY $src_binary ./vmauth-prod

View File

@@ -0,0 +1,31 @@
# Arbitrary number of usernames may be put here.
# Usernames must be unique.
users:
# The user for querying local single-node VictoriaMetrics.
# All the requests to http://vmauth:8427 with the given Basic Auth (username:password)
# will be routed to http://localhost:8428 .
# For example, http://vmauth:8427/api/v1/query is routed to http://localhost:8428/api/v1/query
- username: "local-single-node"
password: "***"
url_prefix: "http://localhost:8428"
# The user for querying account 123 in VictoriaMetrics cluster
# See https://victoriametrics.github.io/Cluster-VictoriaMetrics.html#url-format
# All the requests to http://vmauth:8427 with the given Basic Auth (username:password)
# will be routed to http://vmselect:8481/select/123/prometheus .
# For example, http://vmauth:8427/api/v1/query is routed to http://vmselect:8481/select/123/prometheus/api/v1/query
- username: "cluster-select-account-123"
password: "***"
url_prefix: "http://vmselect:8481/select/123/prometheus"
# The user for inserting Prometheus data into VictoriaMetrics cluster under account 42
# See https://victoriametrics.github.io/Cluster-VictoriaMetrics.html#url-format
# All the requests to http://vmauth:8427 with the given Basic Auth (username:password)
# will be routed to http://vminsert:8480/insert/42/prometheus .
# For example, http://vmauth:8427/api/v1/write is routed to http://vminsert:8480/insert/42/prometheus/api/v1/write
- username: "cluster-insert-account-42"
password: "***"
url_prefix: "http://vminsert:8480/insert/42/prometheus"

102
app/vmauth/main.go Normal file
View File

@@ -0,0 +1,102 @@
package main
import (
"flag"
"net/http"
"net/http/httputil"
"net/url"
"os"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/envflag"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
)
var (
// httpListenAddr is the address passed to httpserver.Serve and httpserver.Stop in main.
httpListenAddr = flag.String("httpListenAddr", ":8427", "TCP address to listen for http connections")
)
// main starts vmauth: it parses flags, loads the auth config, serves proxied requests
// at -httpListenAddr and shuts everything down after WaitForSigterm returns.
func main() {
	// Write flags and help message to stdout, since it is easier to grep or pipe.
	flag.CommandLine.SetOutput(os.Stdout)
	flag.Usage = usage
	envflag.Parse()
	buildinfo.Init()
	logger.Init()

	listenAddr := *httpListenAddr
	logger.Infof("starting vmauth at %q...", listenAddr)
	launchedAt := time.Now()
	initAuthConfig()
	go httpserver.Serve(listenAddr, requestHandler)
	logger.Infof("started vmauth in %.3f seconds", time.Since(launchedAt).Seconds())

	// Block until the termination signal arrives.
	logger.Infof("received signal %s", procutil.WaitForSigterm())

	shutdownStartedAt := time.Now()
	logger.Infof("gracefully shutting down webservice at %q", listenAddr)
	err := httpserver.Stop(listenAddr)
	if err != nil {
		logger.Fatalf("cannot stop the webservice: %s", err)
	}
	logger.Infof("successfully shut down the webservice in %.3f seconds", time.Since(shutdownStartedAt).Seconds())

	// The config reloader is stopped after the webservice, which reads the config.
	stopAuthConfig()
	logger.Infof("successfully stopped vmauth in %.3f seconds", time.Since(shutdownStartedAt).Seconds())
}
// requestHandler authenticates the request r via Basic Auth against the current
// auth config and proxies it to the url_prefix configured for the matching user.
//
// Requests without Basic Auth credentials get `401 Unauthorized` with a
// `WWW-Authenticate` challenge. Unknown usernames and wrong passwords are reported
// via httpserver.Errorf.
//
// It always returns true.
// NOTE(review): presumably this tells httpserver that the request has been handled - confirm.
func requestHandler(w http.ResponseWriter, r *http.Request) bool {
	username, password, hasCredentials := r.BasicAuth()
	if !hasCredentials {
		w.Header().Set("WWW-Authenticate", `Basic realm="Restricted"`)
		http.Error(w, "missing `Authorization: Basic *` header", http.StatusUnauthorized)
		return true
	}

	users := authConfig.Load().(map[string]*UserInfo)
	user := users[username]
	if user == nil || user.Password != password {
		httpserver.Errorf(w, r, "cannot find the provided username %q or password in config", username)
		return true
	}
	user.requests.Inc()

	targetURL := createTargetURL(user.URLPrefix, r.URL)
	_, err := url.Parse(targetURL)
	if err != nil {
		httpserver.Errorf(w, r, "invalid targetURL=%q: %s", targetURL, err)
		return true
	}

	// The target url is passed to reverseProxy.Director via the request header.
	r.Header.Set("vm-target-url", targetURL)
	reverseProxy.ServeHTTP(w, r)
	return true
}
// reverseProxy proxies requests to the url stored in the `vm-target-url` request header
// by requestHandler.
var reverseProxy = &httputil.ReverseProxy{
	Director:      setProxyTargetURL,
	Transport:     newProxyTransport(),
	FlushInterval: time.Second,
	ErrorLog:      logger.StdErrorLogger(),
}

// setProxyTargetURL replaces r.URL with the url from the `vm-target-url` request header.
//
// The header value is expected to be a valid url, since requestHandler validates it
// before calling the proxy. A parse failure is reported via logger.Panicf.
func setProxyTargetURL(r *http.Request) {
	rawTarget := r.Header.Get("vm-target-url")
	parsedTarget, err := url.Parse(rawTarget)
	if err != nil {
		logger.Panicf("BUG: unexpected error when parsing targetURL=%q: %s", rawTarget, err)
	}
	r.URL = parsedTarget
}

// newProxyTransport returns a clone of http.DefaultTransport adjusted for proxying
// to VictoriaMetrics components.
func newProxyTransport() *http.Transport {
	transport := http.DefaultTransport.(*http.Transport).Clone()
	// Automatic compression must be disabled in order to fix https://github.com/VictoriaMetrics/VictoriaMetrics/issues/535
	transport.DisableCompression = true
	// Disable HTTP/2.0, since VictoriaMetrics components don't support HTTP/2.0 (because there is no sense in this).
	transport.ForceAttemptHTTP2 = false
	return transport
}
// usage passes the vmauth description to flagutil.Usage.
// It is installed as flag.Usage in main.
func usage() {
	flagutil.Usage(`
vmauth authenticates and authorizes incoming requests and proxies them to VictoriaMetrics.
See the docs at https://victoriametrics.github.io/vmauth.html .
`)
}

16
app/vmauth/target_url.go Normal file
View File

@@ -0,0 +1,16 @@
package main
import (
"net/url"
"path"
"strings"
)
// createTargetURL returns the url to proxy the request to: prefix followed by
// the request URI (cleaned path plus query string) of u.
//
// Note that u.Path is modified in place: it is replaced with its cleaned form
// and is made to start with '/'.
func createTargetURL(prefix string, u *url.URL) string {
	// Prevent from attacks with using `..` in r.URL.Path
	cleanedPath := path.Clean(u.Path)
	if !strings.HasPrefix(cleanedPath, "/") {
		cleanedPath = "/" + cleanedPath
	}
	u.Path = cleanedPath
	return prefix + u.RequestURI()
}

View File

@@ -0,0 +1,26 @@
package main
import (
"net/url"
"testing"
)
// TestCreateTargetURL verifies that createTargetURL joins the prefix with the cleaned
// request URI, including the cases with `..` path elements and query strings.
func TestCreateTargetURL(t *testing.T) {
	testCases := []struct {
		prefix         string
		requestURI     string
		expectedTarget string
	}{
		{"http://foo.bar", "", "http://foo.bar/."},
		{"http://foo.bar", "/", "http://foo.bar/"},
		{"http://foo.bar", "a/b?c=d", "http://foo.bar/a/b?c=d"},
		{"https://sss:3894/x/y", "/z", "https://sss:3894/x/y/z"},
		{"https://sss:3894/x/y", "/../../aaa", "https://sss:3894/x/y/aaa"},
		{"https://sss:3894/x/y", "/./asd/../../aaa?a=d&s=s/../d", "https://sss:3894/x/y/aaa?a=d&s=s/../d"},
	}
	for _, tc := range testCases {
		parsed, err := url.Parse(tc.requestURI)
		if err != nil {
			t.Fatalf("cannot parse %q: %s", tc.requestURI, err)
		}
		if got := createTargetURL(tc.prefix, parsed); got != tc.expectedTarget {
			t.Fatalf("unexpected target; got %q; want %q", got, tc.expectedTarget)
		}
	}
}

73
app/vmbackup/Makefile Normal file
View File

@@ -0,0 +1,73 @@
# All these commands must run from repository root.
vmbackup:
APP_NAME=vmbackup $(MAKE) app-local
vmbackup-race:
APP_NAME=vmbackup RACE=-race $(MAKE) app-local
vmbackup-prod:
APP_NAME=vmbackup $(MAKE) app-via-docker
vmbackup-pure-prod:
APP_NAME=vmbackup $(MAKE) app-via-docker-pure
vmbackup-amd64-prod:
APP_NAME=vmbackup $(MAKE) app-via-docker-amd64
vmbackup-arm-prod:
APP_NAME=vmbackup $(MAKE) app-via-docker-arm
vmbackup-arm64-prod:
APP_NAME=vmbackup $(MAKE) app-via-docker-arm64
vmbackup-ppc64le-prod:
APP_NAME=vmbackup $(MAKE) app-via-docker-ppc64le
vmbackup-386-prod:
APP_NAME=vmbackup $(MAKE) app-via-docker-386
package-vmbackup:
APP_NAME=vmbackup $(MAKE) package-via-docker
package-vmbackup-pure:
APP_NAME=vmbackup $(MAKE) package-via-docker-pure
package-vmbackup-amd64:
APP_NAME=vmbackup $(MAKE) package-via-docker-amd64
package-vmbackup-arm:
APP_NAME=vmbackup $(MAKE) package-via-docker-arm
package-vmbackup-arm64:
APP_NAME=vmbackup $(MAKE) package-via-docker-arm64
package-vmbackup-ppc64le:
APP_NAME=vmbackup $(MAKE) package-via-docker-ppc64le
package-vmbackup-386:
APP_NAME=vmbackup $(MAKE) package-via-docker-386
publish-vmbackup:
APP_NAME=vmbackup $(MAKE) publish-via-docker
vmbackup-amd64:
CGO_ENABLED=1 GOARCH=amd64 $(MAKE) vmbackup-local-with-goarch
vmbackup-arm:
CGO_ENABLED=0 GOARCH=arm $(MAKE) vmbackup-local-with-goarch
vmbackup-arm64:
CGO_ENABLED=0 GOARCH=arm64 $(MAKE) vmbackup-local-with-goarch
vmbackup-ppc64le:
CGO_ENABLED=0 GOARCH=ppc64le $(MAKE) vmbackup-local-with-goarch
vmbackup-386:
CGO_ENABLED=0 GOARCH=386 $(MAKE) vmbackup-local-with-goarch
vmbackup-local-with-goarch:
APP_NAME=vmbackup $(MAKE) app-local-with-goarch
vmbackup-pure:
APP_NAME=vmbackup $(MAKE) app-local-pure

253
app/vmbackup/README.md Normal file
View File

@@ -0,0 +1,253 @@
## vmbackup
`vmbackup` creates VictoriaMetrics data backups from [instant snapshots](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-work-with-snapshots).
Supported storage systems for backups:
* [GCS](https://cloud.google.com/storage/). Example: `gcs://<bucket>/<path/to/backup>`
* [S3](https://aws.amazon.com/s3/). Example: `s3://<bucket>/<path/to/backup>`
* Any S3-compatible storage such as [MinIO](https://github.com/minio/minio), [Ceph](https://docs.ceph.com/docs/mimic/radosgw/s3/) or [Swift](https://www.swiftstack.com/docs/admin/middleware/s3_middleware.html). See [these docs](#advanced-usage) for details.
* Local filesystem. Example: `fs://</absolute/path/to/backup>`
`vmbackup` supports incremental and full backups. Incremental backups are created automatically if the destination path already contains data from the previous backup.
Full backups can be sped up with `-origin` pointing to already existing backup on the same remote storage. In this case `vmbackup` makes server-side copy for the shared
data between the existing backup and new backup. It saves time and costs on data transfer.
Backup process can be interrupted at any time. It is automatically resumed from the interruption point when restarting `vmbackup` with the same args.
Backed up data can be restored with [vmrestore](https://victoriametrics.github.io/vmrestore.html).
See [this article](https://medium.com/@valyala/speeding-up-backups-for-big-time-series-databases-533c1a927883) for more details.
See also [vmbackuper](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/466) tool built on top of `vmbackup`. This tool simplifies
creation of hourly, daily, weekly and monthly backups.
### Use cases
#### Regular backups
Regular backup can be performed with the following command:
```
vmbackup -storageDataPath=</path/to/victoria-metrics-data> -snapshotName=<local-snapshot> -dst=gcs://<bucket>/<path/to/new/backup>
```
* `</path/to/victoria-metrics-data>` - path to VictoriaMetrics data pointed by `-storageDataPath` command-line flag in single-node VictoriaMetrics or in cluster `vmstorage`.
There is no need to stop VictoriaMetrics for creating backups, since they are performed from immutable [instant snapshots](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-work-with-snapshots).
* `<local-snapshot>` is the snapshot to back up. See [how to create instant snapshots](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-work-with-snapshots).
* `<bucket>` is an already existing name for [GCS bucket](https://cloud.google.com/storage/docs/creating-buckets).
* `<path/to/new/backup>` is the destination path where new backup will be placed.
#### Regular backups with server-side copy from existing backup
If the destination GCS bucket already contains the previous backup at `-origin` path, then new backup can be sped up
with the following command:
```
vmbackup -storageDataPath=</path/to/victoria-metrics-data> -snapshotName=<local-snapshot> -dst=gcs://<bucket>/<path/to/new/backup> -origin=gcs://<bucket>/<path/to/existing/backup>
```
It saves time and network bandwidth costs by performing server-side copy for the shared data from the `-origin` to `-dst`.
#### Incremental backups
Incremental backups are performed if `-dst` points to an already existing backup. In this case only new data is uploaded to remote storage.
It saves time and network bandwidth costs when working with big backups:
```
vmbackup -storageDataPath=</path/to/victoria-metrics-data> -snapshotName=<local-snapshot> -dst=gcs://<bucket>/<path/to/existing/backup>
```
#### Smart backups
Smart backups mean storing full daily backups into `YYYYMMDD` folders and creating incremental hourly backup into `latest` folder:
* Run the following command every hour:
```
vmbackup -snapshotName=<latest-snapshot> -dst=gcs://<bucket>/latest
```
Where `<latest-snapshot>` is the latest [snapshot](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-work-with-snapshots).
The command will upload only changed data to `gcs://<bucket>/latest`.
* Run the following command once a day:
```
vmbackup -snapshotName=<daily-snapshot> -dst=gcs://<bucket>/<YYYYMMDD> -origin=gcs://<bucket>/latest
```
Where `<daily-snapshot>` is the snapshot for the last day `<YYYYMMDD>`.
This approach saves network bandwidth costs on hourly backups (since they are incremental) and allows recovering data from either the last hour (`latest` backup)
or from any day (`YYYYMMDD` backups). Note that hourly backup shouldn't run when creating daily backup.
Do not forget removing old snapshots and backups when they are no longer needed for saving storage costs.
See also [vmbackuper tool](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/466) for automating smart backups.
### How does it work?
The backup algorithm is the following:
1. Collect information about files in the `-snapshotName`, in the `-dst` and in the `-origin`.
2. Determine files in `-dst`, which are missing in `-snapshotName`, and delete them. These are usually small files, which are already merged into bigger files in the snapshot.
3. Determine files from `-snapshotName`, which are missing in `-dst`. These are usually small new files and bigger merged files.
4. Determine files from step 3, which exist in the `-origin`, and perform server-side copy of these files from `-origin` to `-dst`.
These are usually the biggest and the oldest files, which are shared between backups.
5. Upload the remaining files from step 3 from `-snapshotName` to `-dst`.
The algorithm splits source files into 100 MB chunks in the backup. Each chunk is stored as a separate file in the backup.
Such splitting minimizes the amounts of data to re-transfer after temporary errors.
`vmbackup` relies on [instant snapshot](https://medium.com/@valyala/how-victoriametrics-makes-instant-snapshots-for-multi-terabyte-time-series-data-e1f3fb0e0282) properties:
- All the files in the snapshot are immutable.
- Old files are periodically merged into new files.
- Smaller files have higher probability to be merged.
- Consecutive snapshots share many identical files.
These properties allow performing fast and cheap incremental backups and server-side copying from `-origin` paths.
See [this article](https://medium.com/@valyala/speeding-up-backups-for-big-time-series-databases-533c1a927883) for more details.
`vmbackup` can work improperly or slowly when these properties are violated.
### Troubleshooting
* If the backup is slow, then try setting higher value for `-concurrency` flag. This will increase the number of concurrent workers that upload data to backup storage.
* If `vmbackup` eats all the network bandwidth, then set `-maxBytesPerSecond` to the desired value.
* If `vmbackup` has been interrupted due to temporary error, then just restart it with the same args. It will resume the backup process.
* Backups created from [single-node VictoriaMetrics](https://victoriametrics.github.io/Single-server-VictoriaMetrics.html) cannot be restored
at [cluster VictoriaMetrics](https://victoriametrics.github.io/Cluster-VictoriaMetrics.html) and vice versa.
### Advanced usage
* Obtaining credentials from a file.
Add flag `-credsFilePath=/etc/credentials` with the following content:
for s3 (aws, minio or other s3 compatible storages):
```bash
[default]
aws_access_key_id=theaccesskey
aws_secret_access_key=thesecretaccesskeyvalue
```
for GCS (Google Cloud Storage):
```json
{
"type": "service_account",
"project_id": "project-id",
"private_key_id": "key-id",
"private_key": "-----BEGIN PRIVATE KEY-----\nprivate-key\n-----END PRIVATE KEY-----\n",
"client_email": "service-account-email",
"client_id": "client-id",
"auth_uri": "https://accounts.google.com/o/oauth2/auth",
"token_uri": "https://accounts.google.com/o/oauth2/token",
"auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
"client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/service-account-email"
}
```
* Usage with s3 custom url endpoint. It is possible to use `vmbackup` with s3 compatible storages like minio, cloudian, etc.
You have to add a custom url endpoint via flag:
```
# for minio
-customS3Endpoint=http://localhost:9000
# for aws gov region
-customS3Endpoint=https://s3-fips.us-gov-west-1.amazonaws.com
```
* Run `vmbackup -help` in order to see all the available options:
```
-concurrency int
The number of concurrent workers. Higher concurrency may reduce backup duration (default 10)
-configFilePath string
Path to file with S3 configs. Configs are loaded from default location if not set.
See https://docs.aws.amazon.com/general/latest/gr/aws-security-credentials.html
-configProfile string
Profile name for S3 configs. If no set, the value of the environment variable will be loaded (AWS_PROFILE or AWS_DEFAULT_PROFILE), or if both not set, DefaultSharedConfigProfile is used
-credsFilePath string
Path to file with GCS or S3 credentials. Credentials are loaded from default locations if not set.
See https://cloud.google.com/iam/docs/creating-managing-service-account-keys and https://docs.aws.amazon.com/general/latest/gr/aws-security-credentials.html
-customS3Endpoint string
Custom S3 endpoint for use with S3-compatible storages (e.g. MinIO). S3 is used if not set
-dst string
Where to put the backup on the remote storage. Example: gcs://bucket/path/to/backup/dir, s3://bucket/path/to/backup/dir or fs:///path/to/local/backup/dir
-dst can point to the previous backup. In this case incremental backup is performed, i.e. only changed data is uploaded
-envflag.enable
Whether to enable reading flags from environment variables additionally to command line. Command line flag values have priority over values from environment vars. Flags are read only from command line if this flag isn't set
-envflag.prefix string
Prefix for environment variables if -envflag.enable is set
-fs.disableMmap
Whether to use pread() instead of mmap() for reading data files. By default mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot read data files bigger than 2^32 bytes in memory. mmap() is usually faster for reading small data chunks than pread()
-loggerErrorsPerSecondLimit int
Per-second limit on the number of ERROR messages. If more than the given number of errors are emitted per second, then the remaining errors are suppressed. Zero value disables the rate limit (default 10)
-loggerFormat string
Format for logs. Possible values: default, json (default "default")
-loggerLevel string
Minimum level of errors to log. Possible values: INFO, WARN, ERROR, FATAL, PANIC (default "INFO")
-loggerOutput string
Output for the logs. Supported values: stderr, stdout (default "stderr")
-maxBytesPerSecond value
The maximum upload speed. There is no limit if it is set to 0
Supports the following optional suffixes for values: KB, MB, GB, KiB, MiB, GiB (default 0)
-memory.allowedBytes value
Allowed size of system memory VictoriaMetrics caches may occupy. This option overrides -memory.allowedPercent if set to non-zero value. Too low value may increase cache miss rate, which usually results in higher CPU and disk IO usage. Too high value may evict too much data from OS page cache, which will result in higher disk IO usage
Supports the following optional suffixes for values: KB, MB, GB, KiB, MiB, GiB (default 0)
-memory.allowedPercent float
Allowed percent of system memory VictoriaMetrics caches may occupy. See also -memory.allowedBytes. Too low value may increase cache miss rate, which usually results in higher CPU and disk IO usage. Too high value may evict too much data from OS page cache, which will result in higher disk IO usage (default 60)
-origin string
Optional origin directory on the remote storage with old backup for server-side copying when performing full backup. This speeds up full backups
-snapshot.createURL string
VictoriaMetrics create snapshot url. When this is given a snapshot will automatically be created during backup. Example: http://victoriametrics:8428/snapshot/create
-snapshot.deleteURL string
VictoriaMetrics delete snapshot url. Optional. Will be generated from -snapshot.createURL if not provided. All created snapshots will be automatically deleted. Example: http://victoriametrics:8428/snapshot/delete
-snapshotName string
Name for the snapshot to backup. See https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-work-with-snapshots
-storageDataPath string
Path to VictoriaMetrics data. Must match -storageDataPath from VictoriaMetrics or vmstorage (default "victoria-metrics-data")
-version
Show VictoriaMetrics version
```
### How to build from sources
It is recommended using [binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) - see `vmutils-*` archives there.
#### Development build
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.13.
2. Run `make vmbackup` from the root folder of the repository.
It builds `vmbackup` binary and puts it into the `bin` folder.
#### Production build
1. [Install docker](https://docs.docker.com/install/).
2. Run `make vmbackup-prod` from the root folder of the repository.
It builds `vmbackup-prod` binary and puts it into the `bin` folder.
#### Building docker images
Run `make package-vmbackup`. It builds `victoriametrics/vmbackup:<PKG_TAG>` docker image locally.
`<PKG_TAG>` is auto-generated image tag, which depends on source code in the repository.
The `<PKG_TAG>` may be manually set via `PKG_TAG=foobar make package-vmbackup`.
The base docker image is [alpine](https://hub.docker.com/_/alpine) but it is possible to use any other base image
by setting it via `<ROOT_IMAGE>` environment variable. For example, the following command builds the image on top of [scratch](https://hub.docker.com/_/scratch) image:
```bash
ROOT_IMAGE=scratch make package-vmbackup
```

View File

@@ -0,0 +1,6 @@
ARG base_image
FROM $base_image
ENTRYPOINT ["/vmbackup-prod"]
ARG src_binary
COPY $src_binary ./vmbackup-prod

154
app/vmbackup/main.go Normal file
View File

@@ -0,0 +1,154 @@
package main
import (
"flag"
"fmt"
"os"
"strings"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmbackup/snapshot"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/backup/actions"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/backup/common"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/backup/fslocal"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/backup/fsnil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/envflag"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
)
// Command-line flags for vmbackup.
var (
	storageDataPath   = flag.String("storageDataPath", "victoria-metrics-data", "Path to VictoriaMetrics data. Must match -storageDataPath from VictoriaMetrics or vmstorage")
	snapshotName      = flag.String("snapshotName", "", "Name for the snapshot to backup. See https://victoriametrics.github.io/Single-server-VictoriaMetrics.html#how-to-work-with-snapshots")
	snapshotCreateURL = flag.String("snapshot.createURL", "", "VictoriaMetrics create snapshot url. When this is given a snapshot will automatically be created during backup. "+
		"Example: http://victoriametrics:8428/snapshot/create")
	snapshotDeleteURL = flag.String("snapshot.deleteURL", "", "VictoriaMetrics delete snapshot url. Optional. Will be generated from -snapshot.createURL if not provided. "+
		"All created snapshots will be automatically deleted. Example: http://victoriametrics:8428/snapshot/delete")
	dst = flag.String("dst", "", "Where to put the backup on the remote storage. "+
		"Example: gcs://bucket/path/to/backup/dir, s3://bucket/path/to/backup/dir or fs:///path/to/local/backup/dir\n"+
		"-dst can point to the previous backup. In this case incremental backup is performed, i.e. only changed data is uploaded")
	origin            = flag.String("origin", "", "Optional origin directory on the remote storage with old backup for server-side copying when performing full backup. This speeds up full backups")
	concurrency       = flag.Int("concurrency", 10, "The number of concurrent workers. Higher concurrency may reduce backup duration")
	maxBytesPerSecond = flagutil.NewBytes("maxBytesPerSecond", 0, "The maximum upload speed. There is no limit if it is set to 0")
)
// main parses flags and performs the backup.
//
// If -snapshot.createURL is set, then a snapshot is created before the backup
// and is deleted after it, independently of whether the backup succeeds.
func main() {
	// Write flags and help message to stdout, since it is easier to grep or pipe.
	flag.CommandLine.SetOutput(os.Stdout)
	flag.Usage = usage
	envflag.Parse()
	buildinfo.Init()
	logger.Init()

	if len(*snapshotCreateURL) == 0 {
		// Back up the already existing snapshot referred by -snapshotName.
		if err := runBackup(); err != nil {
			logger.Fatalf("%s", err)
		}
		return
	}

	logger.Infof("Snapshots enabled")
	logger.Infof("Snapshot create url %s", *snapshotCreateURL)
	if len(*snapshotDeleteURL) == 0 {
		err := flag.Set("snapshot.deleteURL", strings.Replace(*snapshotCreateURL, "/create", "/delete", 1))
		if err != nil {
			logger.Fatalf("Failed to set snapshot.deleteURL flag: %v", err)
		}
	}
	logger.Infof("Snapshot delete url %s", *snapshotDeleteURL)

	name, err := snapshot.Create(*snapshotCreateURL)
	if err != nil {
		logger.Fatalf("cannot create snapshot: %s", err)
	}

	// The snapshot exists from this point on. logger.Fatalf mustn't be called
	// until the snapshot is deleted, since deferred functions wouldn't run in this case
	// and the snapshot would be left on the server.
	backupErr := flag.Set("snapshotName", name)
	if backupErr != nil {
		backupErr = fmt.Errorf("cannot set snapshotName flag: %w", backupErr)
	} else {
		backupErr = runBackup()
	}
	deleteErr := snapshot.Delete(*snapshotDeleteURL, name)

	if backupErr != nil {
		if deleteErr != nil {
			logger.Errorf("cannot delete snapshot: %s", deleteErr)
		}
		logger.Fatalf("%s", backupErr)
	}
	if deleteErr != nil {
		logger.Fatalf("cannot delete snapshot: %s", deleteErr)
	}
}

// runBackup backs up the snapshot referred by -snapshotName to -dst,
// using -origin for server-side copying if it is set.
func runBackup() error {
	srcFS, err := newSrcFS()
	if err != nil {
		return err
	}
	dstFS, err := newDstFS()
	if err != nil {
		return err
	}
	originFS, err := newOriginFS()
	if err != nil {
		return err
	}
	a := &actions.Backup{
		Concurrency: *concurrency,
		Src:         srcFS,
		Dst:         dstFS,
		Origin:      originFS,
	}
	if err := a.Run(); err != nil {
		return fmt.Errorf("cannot create backup: %w", err)
	}
	srcFS.MustStop()
	dstFS.MustStop()
	originFS.MustStop()
	return nil
}
// usage passes the vmbackup description to flagutil.Usage.
// It is installed as flag.Usage in main.
func usage() {
	const s = `
vmbackup performs backups for VictoriaMetrics data from instant snapshots to gcs, s3
or local filesystem. Backed up data can be restored with vmrestore.
See the docs at https://victoriametrics.github.io/vmbackup.html .
`
	flagutil.Usage(s)
}
// newSrcFS returns the initialized local filesystem for the snapshot
// located at <-storageDataPath>/snapshots/<-snapshotName>.
//
// An error is returned if the snapshot name is empty, if the snapshot path
// cannot be opened or if it isn't a directory.
func newSrcFS() (*fslocal.FS, error) {
	if *snapshotName == "" {
		return nil, fmt.Errorf("`-snapshotName` or `-snapshot.createURL` must be provided")
	}
	snapshotPath := *storageDataPath + "/snapshots/" + *snapshotName

	// Verify the snapshot exists.
	dir, openErr := os.Open(snapshotPath)
	if openErr != nil {
		return nil, fmt.Errorf("cannot open snapshot at %q: %w", snapshotPath, openErr)
	}
	info, statErr := dir.Stat()
	// The handle is needed only for the Stat call above; the Close error is ignored.
	_ = dir.Close()
	if statErr != nil {
		return nil, fmt.Errorf("cannot stat %q: %w", snapshotPath, statErr)
	}
	if !info.IsDir() {
		return nil, fmt.Errorf("snapshot %q must be a directory", snapshotPath)
	}

	srcFS := &fslocal.FS{
		Dir:               snapshotPath,
		MaxBytesPerSecond: maxBytesPerSecond.N,
	}
	if initErr := srcFS.Init(); initErr != nil {
		return nil, fmt.Errorf("cannot initialize fs: %w", initErr)
	}
	return srcFS, nil
}
// newDstFS returns the remote filesystem for the backup destination given in `-dst`.
func newDstFS() (common.RemoteFS, error) {
	remoteFS, err := actions.NewRemoteFS(*dst)
	if err == nil {
		return remoteFS, nil
	}
	return nil, fmt.Errorf("cannot parse `-dst`=%q: %w", *dst, err)
}
// newOriginFS returns the filesystem for the `-origin` backup.
//
// fsnil.FS is returned if `-origin` isn't set.
func newOriginFS() (common.OriginFS, error) {
	originPath := *origin
	if originPath == "" {
		return &fsnil.FS{}, nil
	}
	remoteFS, err := actions.NewRemoteFS(originPath)
	if err != nil {
		return nil, fmt.Errorf("cannot parse `-origin`=%q: %w", originPath, err)
	}
	return remoteFS, nil
}

View File

@@ -0,0 +1,92 @@
package snapshot
import (
"encoding/json"
"errors"
"fmt"
"io/ioutil"
"net/http"
"net/url"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
)
// snapshot is the JSON response Create and Delete decode from the snapshot api endpoints.
type snapshot struct {
// Status is checked against "ok" and "error" by Create and Delete.
Status string `json:"status"`
// Snapshot is the snapshot name returned by Create when Status is "ok".
Snapshot string `json:"snapshot"`
// Msg is used as the error text when Status is "error".
Msg string `json:"msg"`
}
// Create creates a snapshot at the provided api endpoint and returns
// the snapshot name.
//
// An error is returned if the request fails, if the response status code isn't 200,
// if the response body isn't valid JSON or if the response status isn't "ok".
func Create(createSnapshotURL string) (string, error) {
	logger.Infof("Creating snapshot")
	u, err := url.Parse(createSnapshotURL)
	if err != nil {
		return "", err
	}
	resp, err := http.Get(u.String())
	if err != nil {
		return "", err
	}
	// The body must be closed in order to release the underlying connection.
	defer func() { _ = resp.Body.Close() }()

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return "", err
	}
	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("unexpected status code returned from %q; expecting %d; got %d; response body: %q", createSnapshotURL, http.StatusOK, resp.StatusCode, body)
	}

	var snap snapshot
	if err := json.Unmarshal(body, &snap); err != nil {
		return "", fmt.Errorf("cannot parse JSON response from %q: %w; response body: %q", createSnapshotURL, err, body)
	}

	switch snap.Status {
	case "ok":
		logger.Infof("Snapshot %s created", snap.Snapshot)
		return snap.Snapshot, nil
	case "error":
		return "", errors.New(snap.Msg)
	default:
		return "", fmt.Errorf("unknown status: %v", snap.Status)
	}
}
// Delete deletes a snapshot via the provided API endpoint and returns
// any failure.
//
// snapshotName is sent to deleteSnapshotURL as the "snapshot" form value
// of an HTTP POST. The response must have status code 200 and a JSON body
// with a "status" field: "ok" means success, "error" yields an error
// carrying the "msg" field, and any other status is reported as an error.
func Delete(deleteSnapshotURL string, snapshotName string) error {
	logger.Infof("Deleting snapshot %s", snapshotName)
	formData := url.Values{
		"snapshot": {snapshotName},
	}
	u, err := url.Parse(deleteSnapshotURL)
	if err != nil {
		return err
	}
	resp, err := http.PostForm(u.String(), formData)
	if err != nil {
		return err
	}
	body, err := ioutil.ReadAll(resp.Body)
	// Always close the body, so the underlying connection isn't leaked.
	_ = resp.Body.Close()
	if err != nil {
		return fmt.Errorf("cannot read response body from %q: %w", deleteSnapshotURL, err)
	}
	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("unexpected status code returned from %q; expecting %d; got %d; response body: %q",
			deleteSnapshotURL, http.StatusOK, resp.StatusCode, body)
	}

	var snap snapshot
	if err := json.Unmarshal(body, &snap); err != nil {
		return fmt.Errorf("cannot parse JSON response from %q: %w; response body: %q", deleteSnapshotURL, err, body)
	}
	switch snap.Status {
	case "ok":
		logger.Infof("Snapshot %s deleted", snapshotName)
		return nil
	case "error":
		return errors.New(snap.Msg)
	default:
		return fmt.Errorf("unknown status: %q", snap.Status)
	}
}

Some files were not shown because too many files have changed in this diff Show More