Compare commits

...

85 Commits

Author SHA1 Message Date
Aliaksandr Valialkin
8905bc2a40 app/vmagent: check for error returned from flag.Set 2020-05-21 16:31:14 +03:00
Aliaksandr Valialkin
f9847352b4 app/vmagent: add -dryRun option for checking all the configs mentioned in command-line flags without running vmagent
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/362
2020-05-21 15:23:27 +03:00
Aliaksandr Valialkin
d1a9d8aa1c lib/promscrape: add -promscrape.config.dryRun flag for checking -promscrape.config for errors or unsupported options
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/508
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/362
2020-05-21 14:55:11 +03:00
Aliaksandr Valialkin
b93b01bc6d docs/vmagent.md: sync with app/vmagent/README.md 2020-05-21 12:11:15 +03:00
Aliaksandr Valialkin
dbd8beccfa app/vmselect/promql: add ascent_over_time(m[d]) and descent_over_time(m[d]) functions
These functions could be useful in GPS tracking apps for calculating the summary for height gain/loss
over the given duration `d`.
2020-05-21 12:07:48 +03:00
kreedom
6b23df2bec vmalert add quotes escape function (#510)
* vmalert add quotes escape function

Co-authored-by: kreedom
2020-05-20 22:20:31 +03:00
Aaron France
619d4959c7 Update README.md 2020-05-20 09:04:31 +03:00
Aliaksandr Valialkin
70ea4e28a7 app/vmselect/promql: update numbers after the upgrade of github.com/VictoriaMetrics/metrics from v1.11.2 to v1.11.3 2020-05-20 03:06:23 +03:00
Aliaksandr Valialkin
74a2943030 vendor: update github.com/VictoriaMetrics/metrics from v1.11.2 to v1.11.3 2020-05-20 02:55:11 +03:00
faceair
b3ec0fb5e2 keep debug symbols (#438)
Co-authored-by: Aliaksandr Valialkin <valyala@gmail.com>
2020-05-20 01:21:46 +03:00
Roman Khavronenko
9a3afea123 dashboards: update troubleshooting row (#505)
* Slow metrics load panel was removed since it is hard to interpret without
additional metrics and stats;
* Slow inserts panel was updated to display percentage of slow inserts comparing
to total number of inserts to show the real impact.
2020-05-20 00:48:45 +03:00
Aliaksandr Valialkin
7705c19720 docs/MetricsQL.md: add a link to https://medium.com/@valyala/promql-tutorial-for-beginners-9ab455142085 2020-05-19 23:20:13 +03:00
Aliaksandr Valialkin
80c18b7275 docs/vmagent.md: mention an alternative to refresh_interval option in scrape configs 2020-05-19 23:10:06 +03:00
Aliaksandr Valialkin
cf87b810b7 lib/promscrape: add -promscrape.discovery.concurrency and -promscrape.discovery.concurrentWaitTime flags for tuning the number of concurrent requests to autodiscovery API servers at Consul or Kubernetes 2020-05-19 17:35:53 +03:00
Aliaksandr Valialkin
538fdfe133 app/vmselect/promql: move common code from aggrFuncOutliersK and newAggrFuncRangeTopK into getRangeTopKTimeseries 2020-05-19 16:11:14 +03:00
Aliaksandr Valialkin
f52769f6ee app/vmselect/promql: fix outilersk calculations 2020-05-19 14:44:53 +03:00
Aliaksandr Valialkin
e6a782498a docs/Quick-Start.md: mention that vmagent can be used instead of Prometheus in most cases 2020-05-19 14:09:27 +03:00
Aliaksandr Valialkin
a441cdd1d9 app/vmselect/promql: add outliersk(N, m) aggregate function for anomaly detection across groups of similar time series 2020-05-19 13:52:36 +03:00
Aliaksandr Valialkin
d0f08b4a58 app/vmalert/notifier: go fmt 2020-05-19 12:59:46 +03:00
Roman Khavronenko
aae8064285 dashboards: updates and fixes (#499)
The new update introduces new row "Troubleshooting" that
contains panels for churn rate and slow-queries/inserts/loads metrics. This row supposed to be reveal the cause of low performance or other issues.

Panels for storage were updated with "bytes-per-datapoint" and "remaining disk size" panels.
2020-05-19 11:51:02 +03:00
kreedom
7e173655ba vmalert - add expr to variables, add escape functions (#495)
* vmalert - add expr to variables, add escape functions

Co-authored-by: kreedom
2020-05-18 11:55:16 +03:00
Roman Khavronenko
92212f04da vmalert: avoid sending resolves for pending alerts (#498)
Before the change we were sending notifications to notifier
if following conditions are met:
* alert is in Fire state
* alert is in Inactive state

We were sending Inactive notifications to resolve alert ASAP. 
Unfortunately, we were sending resolves for Pending alerts that become
Inactive, which is wrong.

In this change we delete alert from the active list if
it was Pending and become Inactive. In this way we now
have Inactive alerts only if they were in state Fire before.
See test change for example.
2020-05-17 15:13:22 +01:00
Roman Khavronenko
de60ad0cd6 vmalert: fix potential race during configuration reloads (#497)
Configuration reload and rules evaluation can't be executed
in same time now. This may make reload time longer but
prevents from potential races.
2020-05-17 15:12:09 +01:00
Aliaksandr Valialkin
7a8ef517ae docs/Articles.md: add https://www.robustperception.io/evaluating-performance-and-correctness to third-party posts 2020-05-17 00:35:44 +03:00
Aliaksandr Valialkin
d61bac9fd9 deployment/docker: update Go builder from v1.14.2 to v1.14.3
This should fix the following issues found in Go v1.14.2.
See https://github.com/golang/go/issues?q=milestone%3AGo1.14.3+label%3ACherryPickApproved for details.
2020-05-16 22:55:27 +03:00
Aliaksandr Valialkin
eac3da478e app/vmalert: run make quicktemplate-gen from the root dir of the repository 2020-05-16 22:46:02 +03:00
Aliaksandr Valialkin
0890c780c2 docs/Single-server-VictoriaMetrics.md: put contact us email to the top of the page 2020-05-16 22:36:59 +03:00
Aliaksandr Valialkin
fd1a6ce9ae docs/Single-server-VictoriaMetrics.md: add Replication and Backups sections 2020-05-16 22:27:48 +03:00
Aliaksandr Valialkin
9b90c841c6 docs/Cluster-VictoriaMetrics.md: add missing endpoints to the list: api/v1/import/csv and api/v1/status/tsdb 2020-05-16 22:13:25 +03:00
Aliaksandr Valialkin
93c87d28f6 all: print --help output to stdout instead of stderr
This is easier to grep and pipe
2020-05-16 11:59:33 +03:00
Aliaksandr Valialkin
23c55181ef docs/Quick-Start.md: update old link to Docker hub to new link 2020-05-16 10:23:08 +03:00
Aliaksandr Valialkin
b19ca3eb5f lib/storage: do not increment vm_slow_metric_name_loads_total counter for metric_ids which shouldnt be prefetched, since this may mislead users 2020-05-16 10:21:17 +03:00
Aliaksandr Valialkin
4e850cd6a7 lib/persistentqueue: a follow-up for https://github.com/VictoriaMetrics/VictoriaMetrics/pull/484 2020-05-16 09:31:46 +03:00
Aliaksandr Valialkin
8cb35974af app/vmrestore: document better that vmrestore works like rsync --delete, i.e. it deletes files in -storageDataPath, which are missing in the backup 2020-05-16 09:22:17 +03:00
肖贝贝
a0380a0a91 fix: fix vmagent multi queue may become one because sync bug (#484)
Co-authored-by: xiaobeibei <xiaobeibei@bigo.sg>
2020-05-16 09:19:52 +03:00
Aliaksandr Valialkin
3a68c47de0 app/vmagent/Makefile: fix make run-vmagent rule 2020-05-15 19:35:10 +03:00
Aliaksandr Valialkin
697b6af10f app/vmagent/remotewrite: remove unused import after the commit 93267f143f 2020-05-15 17:42:19 +03:00
Aliaksandr Valialkin
93267f143f app/vmagent/remotewrite: allow ingesting time series with multiple samples at once
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/481
2020-05-15 17:36:36 +03:00
Aliaksandr Valialkin
3412d5d138 lib/backup: remove misleading -dst mention in error message
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/482
2020-05-15 17:13:37 +03:00
Aliaksandr Valialkin
27f7cca7ff lib/backup: donload only the remaining parts for partially downloaded files after vmrestore restart
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/487
2020-05-15 17:03:33 +03:00
Aliaksandr Valialkin
cf38c3c62f .github/workflows: an attempt to fix loading of golangci-lint 2020-05-15 15:06:24 +03:00
Aliaksandr Valialkin
c56b66210f vendor: make vendor-update 2020-05-15 15:02:10 +03:00
Aliaksandr Valialkin
82ffbcb9a6 app/vmstorage: add vm_slow_metric_name_loads_total metric, which could be used as an indicator when more RAM is needed for improving query performance 2020-05-15 14:11:45 +03:00
Aliaksandr Valialkin
82ccdfaa91 app/vmstorage: add vm_slow_row_inserts_total and vm_slow_per_day_index_inserts_total metrics for determining whether VictoriaMetrics required more RAM for the current number of active time series 2020-05-15 13:44:32 +03:00
Aliaksandr Valialkin
ab8f5545bc docs/vmalert.md: sync with app/vmalert/README.md 2020-05-15 13:27:09 +03:00
Aliaksandr Valialkin
0eacea1de1 lib/{storage,mergeset}: further tuning of compression levels depending on block size
This should improve performance for querying newly added data, since it can be unpacked faster.
2020-05-15 13:24:37 +03:00
Aliaksandr Valialkin
737d641920 lib/storage: wait for all the goroutines to finish in TestSearch in order to prevent racy behavior on test finish 2020-05-15 13:24:37 +03:00
Aliaksandr Valialkin
4fc33163c4 lib/storage: optimize ingestion pefrormance for new time series 2020-05-15 13:24:37 +03:00
Aliaksandr Valialkin
f9f3afb6af lib/mergeset: tune compression levels in order to improve ingestion performance a bit 2020-05-15 13:24:37 +03:00
Aliaksandr Valialkin
8b32e7c3a0 lib/storage: reduce indentation in Storage.add 2020-05-15 13:24:37 +03:00
Aliaksandr Valialkin
1573ececb2 lib/storage: return the first error instead of the last error, since the first error usually points to the root cause 2020-05-15 13:24:37 +03:00
Roman Khavronenko
a249cd9d22 vmalert: fix the access to rules slice element by wrong index (#486)
During group's update rules deletion was causing slice
mutations while slice index was assumed to be unchanged.
This caused "slice bounds out of range" errors when multiple
rules were deleted sequentially.
2020-05-15 07:55:22 +01:00
hagen1778
ef0e37cb9e vmalert: update README 2020-05-15 09:17:28 +03:00
Aliaksandr Valialkin
0afd48d2ee lib: extract common code for returning fast unix timestamp into lib/fasttime 2020-05-14 23:02:07 +03:00
Aliaksandr Valialkin
42866fa754 lib/{storage,mergeset}: return dst on error from unmarshalBlockHeaders, so it could be reused 2020-05-14 15:32:07 +03:00
Aliaksandr Valialkin
827a3a7866 lib/storage: document that getnerateUniqueMetricID should return dense ids 2020-05-14 14:08:45 +03:00
Aliaksandr Valialkin
606585f7be lib/{storage,mergeset}: cleanup: remove unused partSearch.indexBlockReuse 2020-05-14 14:03:03 +03:00
Aliaksandr Valialkin
21598ac417 docs/vmalert.md: sync with app/vmalert/README.md 2020-05-13 22:55:35 +03:00
Aliaksandr Valialkin
894e5d2b9b docs/vmbackup.md: add a link to vmbackuper tool 2020-05-13 22:54:22 +03:00
Roman Khavronenko
415b1ddfb5 vmalert: check if remoteRead object was initied before calling Restore (#473)
The check for non-nil remoteRead was mistakenly dropped
during refactoring which caused panics when `vmalert`
wasn't configured with `remoteRead` flag.
2020-05-13 19:32:58 +01:00
Roman Khavronenko
db7dd96346 vmalert: fix flag names and description in README (#475)
Change also adds the recommendation for `remotewrite`
queue error.
2020-05-13 19:32:21 +01:00
肖贝贝
ba48438b06 Feat/vmalert add max queue size (#472)
* feat: add remoteWrite.maxQueueSize to reduce queue full
* rename remote(write|read) flags to remote(Write|Read) for the sake of consistency

Co-authored-by: xiaobeibei <xiaobeibei@bigo.sg>
2020-05-13 18:58:56 +01:00
Aliaksandr Valialkin
7882a0dbbf app/vmselect/promql: suppress "SA4006: this value of dstValues is never used" error in golangci-lint 2020-05-13 11:47:08 +03:00
Aliaksandr Valialkin
4fe67504f9 lib/storage: optimize label matching for regexp ending with literal suffix
For example, `{label=~"foo.*bar.+baz"}` contains literal suffix `baz`,
so it should work faster now.
2020-05-13 11:47:07 +03:00
Aliaksandr Valialkin
96e001d254 app/vmagent: fix a bug with improper relabeling when multiple -remoteWrite.urlRelableConfig args are set
This bug could result in incorrect relabeling and metrics' drop.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/467
2020-05-12 22:02:58 +03:00
Aliaksandr Valialkin
faf92a0965 app/vmselect/promql: fix any(..) calculations - return all the data points instead of the first one 2020-05-12 20:36:42 +03:00
Aliaksandr Valialkin
a6f16dcc11 lib/fs: do not use mmap for 32-bit arches by default, since they cannot map files bigger than 4GB in RAM 2020-05-12 20:22:09 +03:00
Aliaksandr Valialkin
cc311e20fe app/vmselect/promql: add any(x) by (y) aggregate function, which returns any time series from q for each group y 2020-05-12 19:45:56 +03:00
Aliaksandr Valialkin
574289c3fb app/vmselect/promql: support for sum(x) by (y) limit N syntax in order to limit the number of output time series after aggregation 2020-05-12 19:45:54 +03:00
Aliaksandr Valialkin
0a134ace63 app/vmagent: fix scraping mTLS targets, which has been broken in v1.35.1
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/470
2020-05-12 17:23:03 +03:00
Aliaksandr Valialkin
8300cc17af app/vmagent,lib/promscrape: do not set HostClient.DialDualStack, since it isnt used if HostClient.Dial is set 2020-05-12 15:24:18 +03:00
Aliaksandr Valialkin
6273385618 app/vmagent/remotewrite: properly dial TCP6 addresses set via -remoteWrite.url
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/469
2020-05-12 15:22:29 +03:00
Aliaksandr Valialkin
3232605524 lib/storage: properly initialize part struct before trying to close it on error
This should prevent from nil pointer dereference bug at https://github.com/VictoriaMetrics/VictoriaMetrics/issues/468 .
2020-05-12 14:54:31 +03:00
Aliaksandr Valialkin
18834a9191 vendor: make vendor-update 2020-05-12 14:25:35 +03:00
Aliaksandr Valialkin
d90dc1fbf9 deployment/docker: omit http2 support in *-prod binaries
VictoriaMetrics doesn't use http/2.0, so disable it completely.

Use `nethttpomithttp2` tag defined in Go1.14 for this.
See 2566e21f24 for details.
2020-05-12 14:19:43 +03:00
Aliaksandr Valialkin
dbd0c552d5 lib/storage: gradually pre-populate per-day inverted index for the next day
This should prevent from CPU usage spikes at 00:00 UTC every day when
inverted index for new day must be quickly created for all the active time series.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/430
2020-05-12 12:13:05 +03:00
Aliaksandr Valialkin
cc00a2c453 lib/storage: typo fixes in error messages: or -> of 2020-05-12 12:12:42 +03:00
Aliaksandr Valialkin
ce2107bc52 lib/storage: speed up matching for common regexps in label filters
The following regexps have been optimized:

* 'foo.+bar'
* 'foo.+bar.+baz'

This should improve performance for matching Graphite-like metrics.
2020-05-11 22:40:55 +03:00
Aliaksandr Valialkin
12a1a71cc1 lib/storage: add a benchmark for Graphite-like regexps for metric names 2020-05-11 22:37:32 +03:00
Aliaksandr Valialkin
de113806bb docs/CaseStudies.md: add CERN case study 2020-05-11 14:05:20 +03:00
Roman Khavronenko
8c8ff5d0cb vmalert: cleanup and restructure of code to improve maintainability (#471)
The change introduces new entity `manager` which replaces
`watchdog`, decouples requestHandler and groups. Manager
supposed to control life cycle of groups, rules and
config reloads.

Groups export an ID method which returns a hash
from filename and group name. ID supposed to be unique
identifier across all loaded groups.

Some tests were added to improve coverage.

Bug with wrong annotation value if $value is used in
 templates after metrics being restored fixed.

Notifier interface was extended to accept context.

New set of metrics was introduced for config reload.
2020-05-10 17:58:17 +01:00
Nikolay Khramchikhin
9e8733ff65 vmalert config reload
added config hot reload for vmalert with sighup and api call
2020-05-09 10:32:12 +01:00
Aliaksandr Valialkin
5bc5d6a1f2 docs/Single-server-VictoriaMetrics.md: small updates for Monitoring and How to start VictoriaMetrics sections 2020-05-08 20:34:50 +03:00
Aliaksandr Valialkin
baedb25936 docs/vmauth.md: fix a link to docker images 2020-05-08 14:10:04 +03:00
Aliaksandr Valialkin
6909d845b0 docs/Articles.md: add a link to CERN article at https://indico.cern.ch/event/877333/contributions/3696707/attachments/1972189/3281133/CMS_mon_RD_for_opInt.pdf 2020-05-08 01:25:37 +03:00
179 changed files with 9019 additions and 4407 deletions

View File

@@ -24,7 +24,7 @@ jobs:
run: |
go get -u golang.org/x/lint/golint
go get -u github.com/kisielk/errcheck
go get -u github.com/golangci/golangci-lint/cmd/golangci-lint
curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(go env GOPATH)/bin v1.27.0
- name: Code checkout
uses: actions/checkout@master
- name: Build

View File

@@ -10,16 +10,21 @@
## VictoriaMetrics
VictoriaMetrics is fast, cost-effective and scalable time-series database. It can be used as long-term remote storage for Prometheus.
VictoriaMetrics is fast, cost-effective and scalable time-series database.
It is available in [binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases),
[docker images](https://hub.docker.com/r/victoriametrics/victoria-metrics/) and
in [source code](https://github.com/VictoriaMetrics/VictoriaMetrics). Just download VictoriaMetrics and see [how to start it](#how-to-start-victoriametrics).
Cluster version is available [here](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/cluster).
[Contact us](mailto:info@victoriametrics.com) if you need paid enterprise support for VictoriaMetrics.
## Case studies and talks
* [Adidas](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/CaseStudies#adidas)
* [CERN](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/CaseStudies#cern)
* [COLOPL](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/CaseStudies#colopl)
* [Wix.com](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/CaseStudies#wixcom)
* [Wedos.com](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/CaseStudies#wedoscom)
@@ -33,6 +38,8 @@ Cluster version is available [here](https://github.com/VictoriaMetrics/VictoriaM
## Prominent features
* VictoriaMetrics can be used as long-term storage for Prometheus or for [vmagent](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md).
See [these docs](#prometheus-setup) for details.
* Supports [Prometheus querying API](https://prometheus.io/docs/prometheus/latest/querying/api/), so it can be used as Prometheus drop-in replacement in Grafana.
VictoriaMetrics implements [MetricsQL](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/MetricsQL) query language, which is inspired by PromQL.
* Supports global query view. Multiple Prometheus instances may write data into VictoriaMetrics. Later this data may be used in a single query.
@@ -115,6 +122,8 @@ Cluster version is available [here](https://github.com/VictoriaMetrics/VictoriaM
* [Monitoring](#monitoring)
* [Troubleshooting](#troubleshooting)
* [Backfilling](#backfilling)
* [Replication](#replication)
* [Backups](#backups)
* [Profiling](#profiling)
* [Integrations](#integrations)
* [Third-party contributions](#third-party-contributions)
@@ -139,6 +148,8 @@ The following command-line flags are used the most:
* `-retentionPeriod` - retention period in months for the data. Older data is automatically deleted. Default period is 1 month.
* `-httpListenAddr` - TCP address to listen to for http requests. By default, it listens port `8428` on all the network interfaces.
Other flags have good enough default values, so set them only if you really need this.
Pass `-help` to see all the available flags with description and default values.
It is recommended setting up [monitoring](#monitoring) for VictoriaMetrics.
@@ -779,6 +790,8 @@ remote_write:
kill -HUP `pidof prometheus`
```
It is recommended to use [vmagent](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md) instead of Prometheus for highly loaded setups.
4) Now Prometheus should write data into all the configured `remote_write` urls in parallel.
5) Set up [Promxy](https://github.com/jacksontj/promxy) in front of all the VictoriaMetrics replicas.
6) Set up Prometheus datasource in Grafana that points to Promxy.
@@ -789,6 +802,7 @@ to write data to `victoriametrics-addr-1`, while each `r2` should write data to
Another option is to write data simultaneously from Prometheus HA pair to a pair of VictoriaMetrics instances
with the enabled de-duplication. See [this section](#deduplication) for details.
### Deduplication
VictoriaMetrics de-duplicates data points if `-dedup.minScrapeInterval` command-line flag
@@ -890,7 +904,8 @@ mkfs.ext4 ... -O 64bit,huge_file,extent -T huge
### Monitoring
VictoriaMetrics exports internal metrics in Prometheus format at `/metrics` page.
These metrics may be collected via Prometheus by adding the corresponding scrape config to it.
These metrics may be collected by [vmagent](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md)
or Prometheus by adding the corresponding scrape config to it.
Alternatively they can be self-scraped by setting `-selfScrapeInterval` command-line flag to duration greater than 0.
For example, `-selfScrapeInterval=10s` would enable self-scraping of `/metrics` page with 10 seconds interval.
@@ -901,13 +916,17 @@ The most interesting metrics are:
* `vm_cache_entries{type="storage/hour_metric_ids"}` - the number of time series with new data points during the last hour
aka active time series.
* `rate(vm_new_timeseries_created_total[5m])` - time series churn rate.
* `vm_rows{type="indexdb"}` - the number of rows in inverted index. High value for this number usually mean high churn rate for time series.
* Sum of `vm_rows{type="storage/big"}` and `vm_rows{type="storage/small"}` - total number of `(timestamp, value)` data points
in the database.
* `vm_rows_inserted_total` - the total number of inserted rows since VictoriaMetrics start.
* `increase(vm_new_timeseries_created_total[1h])` - time series churn rate during the previous hour.
* `sum(vm_rows{type=~"storage/.*"})` - total number of `(timestamp, value)` data points in the database.
* `sum(rate(vm_rows_inserted_total[5m]))` - ingestion rate, i.e. how many samples are inserted int the database per second.
* `vm_free_disk_space_bytes` - free space left at `-storageDataPath`.
* `sum(vm_data_size_bytes)` - the total data size on disk.
* `sum(vm_data_size_bytes)` - the total size of data on disk.
* `increase(vm_slow_row_inserts_total[5m])` - the number of slow inserts during the last 5 minutes.
If this number remains high during extended periods of time, then it is likely more RAM is needed for optimal handling
of the current number of active time series.
* `increase(vm_slow_metric_name_loads_total[5m])` - the number of slow loads of metric names during the last 5 minutes.
If this number remains high during extended periods of time, then it is likely more RAM is needed for optimal handling
of the current number of active time series.
### Troubleshooting
@@ -920,8 +939,9 @@ The most interesting metrics are:
* If VictoriaMetrics works slowly and eats more than a CPU core per 100K ingested data points per second,
then it is likely you have too many active time series for the current amount of RAM.
VictoriaMetrics [exposes](#monitoring) `vm_slow_*` metrics, which could be used as an indicator of low amounts of RAM.
It is recommended increasing the amount of RAM on the node with VictoriaMetrics in order to improve
ingestion performance.
ingestion and query performance in this case.
Another option is to increase `-memory.allowedPercent` command-line flag value. Be careful with this
option, since too big value for `-memory.allowedPercent` may result in high I/O usage.
@@ -968,6 +988,24 @@ the query cache, which could contain incomplete data cached during the backfilli
Yet another solution is to increase `-search.cacheTimestampOffset` flag value in order to disable caching
for data with timestamps close to the current time.
### Replication
VictoriaMetrics relies on replicated durable persistent storage such as [Google Cloud disks](https://cloud.google.com/compute/docs/disks#pdspecs)
or [Amazon EBS](https://aws.amazon.com/ebs/). It is also recommended making periodic backups,
since [replication doesn't save from disaster](https://medium.com/@valyala/speeding-up-backups-for-big-time-series-databases-533c1a927883).
See [backup docs](#backups) for details.
See also [high availability docs](#high-availability) and [docs about cluster version of VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md).
### Backups
VictoriaMetrics supports backups via [vmbackup](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmbackup/README.md)
and [vmrestore](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmrestore/README.md) tools.
We also provide provide `vmbackuper` tool for paid enterprise subscribers - see [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/466) for details.
### Profiling
VictoriaMetrics provides handlers for collecting the following [Go profiles](https://blog.golang.org/profiling-go-programs):

View File

@@ -3,6 +3,7 @@ package main
import (
"flag"
"net/http"
"os"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert"
@@ -25,6 +26,8 @@ var (
)
func main() {
// Write flags and help message to stdout, since it is easier to grep or pipe.
flag.CommandLine.SetOutput(os.Stdout)
envflag.Parse()
buildinfo.Init()
logger.Init()

View File

@@ -52,8 +52,9 @@ publish-vmagent:
APP_NAME=vmagent $(MAKE) publish-via-docker
run-vmagent:
mkdir -p vmagent-data
DOCKER_OPTS='-v $(shell pwd)/vmagent-data:/vmagent-data' \
mkdir -p vmagent-remotewrite-data
DOCKER_OPTS='-v $(shell pwd)/vmagent-remotewrite-data:/vmagent-remotewrite-data' \
ARGS='-remoteWrite.url=http://localhost:8428/api/v1/write' \
APP_NAME=vmagent \
$(MAKE) run-via-docker

View File

@@ -1,8 +1,8 @@
## vmagent
`vmagent` is a tiny but brave agent, which helps you collecting metrics from various sources
and storing them to [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics)
or any other Prometheus-compatible storage system that supports `remote_write` protocol.
`vmagent` is a tiny but brave agent, which helps you collect metrics from various sources
and stores them in [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics)
or any other Prometheus-compatible storage system that supports the `remote_write` protocol.
<img alt="vmagent" src="vmagent.png">
@@ -11,7 +11,7 @@ or any other Prometheus-compatible storage system that supports `remote_write` p
While VictoriaMetrics provides an efficient solution to store and observe metrics, our users needed something fast
and RAM friendly to scrape metrics from Prometheus-compatible exporters to VictoriaMetrics.
Also, we found that users infrastructure is like snowflakes - never alike, and we decided to add more flexibility
Also, we found that users infrastructure are snowflakes - no two are alike, and we decided to add more flexibility
to `vmagent` (like the ability to push metrics instead of pulling them). We did our best and plan to do even more.
@@ -31,7 +31,7 @@ to `vmagent` (like the ability to push metrics instead of pulling them). We did
* Works in environments with unstable connections to remote storage. If the remote storage is unavailable, the collected metrics
are buffered at `-remoteWrite.tmpDataPath`. The buffered metrics are sent to remote storage as soon as connection
to remote storage is recovered. The maximum disk usage for the buffer can be limited with `-remoteWrite.maxDiskUsagePerURL`.
* Uses lower amounts of RAM, CPU, disk IO and network bandwidth comparing to Prometheus.
* Uses lower amounts of RAM, CPU, disk IO and network bandwidth compared to Prometheus.
### Quick Start
@@ -40,8 +40,7 @@ Just download `vmutils-*` archive from [releases page](https://github.com/Victor
and pass the following flags to `vmagent` binary in order to start scraping Prometheus targets:
* `-promscrape.config` with the path to Prometheus config file (it is usually located at `/etc/prometheus/prometheus.yml`)
* `-remoteWrite.url` with the remote storage endpoint such as VictoriaMetrics. Multiple `-remoteWrite.url` args can be set in parallel
in order to replicate data concurrently to multiple remote storage systems.
* `-remoteWrite.url` with the remote storage endpoint such as VictoriaMetrics. The `-remoteWrite.url` argument can be specified multiple times in order to replicate data concurrently to an arbitrary amount of remote storage systems.
Example command line:
@@ -49,7 +48,7 @@ Example command line:
/path/to/vmagent -promscrape.config=/path/to/prometheus.yml -remoteWrite.url=https://victoria-metrics-host:8428/api/v1/write
```
If you need collecting only Influx data, then the following command line would be enough:
If you only need to collect Influx data, then the following is sufficient:
```
/path/to/vmagent -remoteWrite.url=https://victoria-metrics-host:8428/api/v1/write
@@ -79,14 +78,14 @@ See [the corresponding Makefile rules](https://github.com/VictoriaMetrics/Victor
#### Drop-in replacement for Prometheus
If you use Prometheus only for scraping metrics from various targets and forwarding these metrics to remote storage,
then `vmagent` can replace such Prometheus setup. Usually `vmagent` requires lower amounts of RAM, CPU and network bandwidth comparing to Prometheus for such setup.
then `vmagent` can replace such Prometheus setup. Usually `vmagent` requires lower amounts of RAM, CPU and network bandwidth comparing to Prometheus for such a setup.
See [these docs](#how-to-collect-metrics-in-prometheus-format) for details.
#### Replication and high availability
`vmagent` replicates the collected metrics among multiple remote storage instances configured via `-remoteWrite.url` args.
If a single remote storage instance temporarily goes out of service, then the collected data remains available in another remote storage instances.
If a single remote storage instance temporarily is out of service, then the collected data remains available in another remote storage instances.
`vmagent` buffers the collected data in files at `-remoteWrite.tmpDataPath` until the remote storage becomes available again.
Then it sends the buffered data to the remote storage in order to prevent data gaps in the remote storage.
@@ -94,13 +93,13 @@ Then it sends the buffered data to the remote storage in order to prevent data g
#### Relabeling and filtering
`vmagent` can add, remove or update labels on the collected data before sending it to remote storage. Additionally,
it can remove unneeded samples via Prometheus-like relabeling before sending the collected data to remote storage.
it can remove unwanted samples via Prometheus-like relabeling before sending the collected data to remote storage.
See [these docs](#relabeling) for details.
#### Splitting data streams among multiple systems
`vmagent` supports splitting of the collected data among muliple destinations with the help of `-remoteWrite.urlRelabelConfig`,
`vmagent` supports splitting the collected data between muliple destinations with the help of `-remoteWrite.urlRelabelConfig`,
which is applied independently for each configured `-remoteWrite.url` destination. For instance, it is possible to replicate or split
data among long-term remote storage, short-term remote storage and real-time analytical system [built on top of Kafka](https://github.com/Telefonica/prometheus-kafka-adapter).
Note that each destination can receive its own subset of the collected data thanks to per-destination relabeling via `-remoteWrite.urlRelabelConfig`.
@@ -148,6 +147,10 @@ The following scrape types in [scrape_config](https://prometheus.io/docs/prometh
* `dns_sd_configs` - for scraping targets discovered from DNS records (SRV, A and AAAA).
See [dns_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#dns_sd_config) for details.
Note that `vmagent` doesn't support `refresh_interval` option these scrape configs. Use the corresponding `-promscrape.*CheckInterval`
command-line flag instead. For example, `-promscrape.consulSDCheckInterval=60s` sets `refresh_interval` for all the `consul_sd_configs`
entries to 60s. Run `vmagent -help` in order to see default values for `-promscrape.*CheckInterval` flags.
File feature requests at [our issue tracker](https://github.com/VictoriaMetrics/VictoriaMetrics/issues) if you need other service discovery mechanisms to be supported by `vmagent`.
@@ -200,12 +203,12 @@ either via `vmagent` itself or via Prometheus, so the exported metrics could be
* When `vmagent` scrapes many unreliable targets, it can flood error log with scrape errors. These errors can be suppressed
by passing `-promscrape.suppressScrapeErrors` command-line flag to `vmagent`. The most recent scrape error per each target can be observed at `http://vmagent-host:8429/targets`.
* It is recommended increasing `-remoteWrite.queues` if `vmagent` collects more than 100K samples per second
* It is recommended to increase `-remoteWrite.queues` if `vmagent` collects more than 100K samples per second
and `vmagent_remotewrite_pending_data_bytes` metric exported at `http://vmagent-host:8429/metrics` page constantly grows.
* `vmagent` buffers scraped data at `-remoteWrite.tmpDataPath` directory until it is sent to `-remoteWrite.url`.
The directory can grow big when remote storage is unavailable during extended periods of time and if `-remoteWrite.maxDiskUsagePerURL` isn't set.
If you don't want sending all the data from the directory to remote storage, just stop `vmagent` and delete the directory.
The directory can grow large when remote storage is unavailable for extended periods of time and if `-remoteWrite.maxDiskUsagePerURL` isn't set.
If you don't want to send all the data from the directory to remote storage, simply stop `vmagent` and delete the directory.
### How to build from sources

View File

@@ -4,6 +4,7 @@ import (
"flag"
"fmt"
"net/http"
"os"
"strings"
"time"
@@ -39,6 +40,8 @@ var (
"Telnet put messages and HTTP /api/put messages are simultaneously served on TCP port. "+
"Usually :4242 must be set. Doesn't work if empty")
opentsdbHTTPListenAddr = flag.String("opentsdbHTTPListenAddr", "", "TCP address to listen for OpentTSDB HTTP put requests. Usually :4242 must be set. Doesn't work if empty")
dryRun = flag.Bool("dryRun", false, "Whether to check only config files without running vmagent. The following files are checked: "+
"-promscrape.config, -remoteWrite.relabelConfig, -remoteWrite.urlRelabelConfig . See also -promscrape.config.dryRun")
)
var (
@@ -49,9 +52,26 @@ var (
)
func main() {
// Write flags and help message to stdout, since it is easier to grep or pipe.
flag.CommandLine.SetOutput(os.Stdout)
envflag.Parse()
buildinfo.Init()
logger.Init()
if *dryRun {
if err := flag.Set("promscrape.config.strictParse", "true"); err != nil {
logger.Panicf("BUG: cannot set promscrape.config.strictParse=true: %s", err)
}
if err := remotewrite.CheckRelabelConfigs(); err != nil {
logger.Fatalf("error when checking relabel configs: %s", err)
}
if err := promscrape.CheckConfig(); err != nil {
logger.Fatalf("error when checking Prometheus config: %s", err)
}
logger.Infof("all the configs are ok; exitting with 0 status code")
return
}
logger.Infof("starting vmagent at %q...", *httpListenAddr)
startTime := time.Now()
remotewrite.Init()

View File

@@ -2,19 +2,17 @@ package remotewrite
import (
"crypto/tls"
"crypto/x509"
"encoding/base64"
"flag"
"fmt"
"io/ioutil"
"strings"
"sync"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/netutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/persistentqueue"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promauth"
"github.com/VictoriaMetrics/fasthttp"
"github.com/VictoriaMetrics/metrics"
)
@@ -29,6 +27,8 @@ var (
"If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url")
tlsCAFile = flagutil.NewArray("remoteWrite.tlsCAFile", "Optional path to TLS CA file to use for verifying connections to -remoteWrite.url. "+
"By default system CA is used. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url")
tlsServerName = flagutil.NewArray("remoteWrite.tlsServerName", "Optional TLS server name to use for connections to -remoteWrite.url. "+
"By default the server name from -remoteWrite.url is used. If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url")
basicAuthUsername = flagutil.NewArray("remoteWrite.basicAuth.username", "Optional basic auth username to use for -remoteWrite.url. "+
"If multiple args are set, then they are applied independently for the corresponding -remoteWrite.url")
@@ -113,7 +113,6 @@ func newClient(argIdx int, remoteWriteURL, urlLabelValue string, fq *persistentq
Addr: host,
Name: "vmagent",
Dial: statDial,
DialDualStack: netutil.TCP6Enabled(),
IsTLS: isTLS,
TLSConfig: tlsCfg,
MaxConns: maxConns,
@@ -154,37 +153,18 @@ func (c *client) MustStop() {
}
func getTLSConfig(argIdx int) (*tls.Config, error) {
var tlsRootCA *x509.CertPool
var tlsCertificate *tls.Certificate
certFile := tlsCertFile.GetOptionalArg(argIdx)
keyFile := tlsKeyFile.GetOptionalArg(argIdx)
if certFile != "" || keyFile != "" {
cert, err := tls.LoadX509KeyPair(certFile, keyFile)
if err != nil {
return nil, fmt.Errorf("cannot load TLS certificate for -remoteWrite.tlsCertFile=%q and -remoteWrite.tlsKeyFile=%q: %s", certFile, keyFile, err)
}
tlsCertificate = &cert
tlsConfig := &promauth.TLSConfig{
CAFile: tlsCAFile.GetOptionalArg(argIdx),
CertFile: tlsCertFile.GetOptionalArg(argIdx),
KeyFile: tlsKeyFile.GetOptionalArg(argIdx),
ServerName: tlsServerName.GetOptionalArg(argIdx),
InsecureSkipVerify: *tlsInsecureSkipVerify,
}
if caFile := tlsCAFile.GetOptionalArg(argIdx); caFile != "" {
data, err := ioutil.ReadFile(caFile)
if err != nil {
return nil, fmt.Errorf("cannot read -remoteWrite.tlsCAFile=%q: %s", caFile, err)
}
tlsRootCA = x509.NewCertPool()
if !tlsRootCA.AppendCertsFromPEM(data) {
return nil, fmt.Errorf("cannot parse data -remoteWrite.tlsCAFile=%q", caFile)
}
cfg, err := promauth.NewConfig(".", nil, "", "", tlsConfig)
if err != nil {
return nil, fmt.Errorf("cannot populate TLS config: %s", err)
}
tlsCfg := &tls.Config{
RootCAs: tlsRootCA,
ClientSessionCache: tls.NewLRUClientSessionCache(0),
}
if tlsCertificate != nil {
tlsCfg.GetClientCertificate = func(*tls.CertificateRequestInfo) (*tls.Certificate, error) {
return tlsCertificate, nil
}
}
tlsCfg.InsecureSkipVerify = *tlsInsecureSkipVerify
tlsCfg := cfg.NewTLSConfig()
return tlsCfg, nil
}

View File

@@ -6,7 +6,7 @@ import (
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/persistentqueue"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
"github.com/VictoriaMetrics/metrics"
@@ -15,7 +15,8 @@ import (
var (
flushInterval = flag.Duration("remoteWrite.flushInterval", time.Second, "Interval for flushing the data to remote storage. "+
"Higher value reduces network bandwidth usage at the cost of delayed push of scraped data to remote storage")
"Higher value reduces network bandwidth usage at the cost of delayed push of scraped data to remote storage. "+
"Minimum supported interval is 1 second")
maxUnpackedBlockSize = flag.Int("remoteWrite.maxBlockSize", 32*1024*1024, "The maximum size in bytes of unpacked request to send to remote storage. "+
"It shouldn't exceed -maxInsertRequestSize from VictoriaMetrics")
)
@@ -55,6 +56,10 @@ func (ps *pendingSeries) Push(tss []prompbmarshal.TimeSeries) {
}
func (ps *pendingSeries) periodicFlusher() {
flushSeconds := int64(flushInterval.Seconds())
if flushSeconds <= 0 {
flushSeconds = 1
}
ticker := time.NewTicker(*flushInterval)
defer ticker.Stop()
mustStop := false
@@ -63,7 +68,7 @@ func (ps *pendingSeries) periodicFlusher() {
case <-ps.stopCh:
mustStop = true
case <-ticker.C:
if time.Since(ps.wr.lastFlushTime) < *flushInterval/2 {
if fasttime.UnixTimestamp()-ps.wr.lastFlushTime < uint64(flushSeconds) {
continue
}
}
@@ -76,7 +81,7 @@ func (ps *pendingSeries) periodicFlusher() {
type writeRequest struct {
wr prompbmarshal.WriteRequest
pushBlock func(block []byte)
lastFlushTime time.Time
lastFlushTime uint64
tss []prompbmarshal.TimeSeries
@@ -108,7 +113,7 @@ func (wr *writeRequest) reset() {
func (wr *writeRequest) flush() {
wr.wr.Timeseries = wr.tss
wr.lastFlushTime = time.Now()
wr.lastFlushTime = fasttime.UnixTimestamp()
pushWriteRequest(&wr.wr, wr.pushBlock)
wr.reset()
}
@@ -144,13 +149,8 @@ func (wr *writeRequest) copyTimeSeries(dst, src *prompbmarshal.TimeSeries) {
}
dst.Labels = labelsDst[labelsLen:]
samplesDst = append(samplesDst, prompbmarshal.Sample{})
dstSample := &samplesDst[len(samplesDst)-1]
if len(src.Samples) != 1 {
logger.Panicf("BUG: unexpected number of samples in time series; got %d; want 1", len(src.Samples))
}
*dstSample = src.Samples[0]
dst.Samples = samplesDst[len(samplesDst)-1:]
samplesDst = append(samplesDst, src.Samples...)
dst.Samples = samplesDst[len(samplesDst)-len(src.Samples):]
wr.samples = samplesDst
wr.labels = labelsDst

View File

@@ -32,6 +32,21 @@ var (
"Disk usage is unlimited if the value is set to 0")
)
// CheckRelabelConfigs checks -remoteWrite.relabelConfig and -remoteWrite.urlRelabelConfig.
func CheckRelabelConfigs() error {
if *relabelConfigPathGlobal != "" {
if _, err := promrelabel.LoadRelabelConfigs(*relabelConfigPathGlobal); err != nil {
return fmt.Errorf("cannot load -remoteWrite.relabelConfig=%q: %s", *relabelConfigPathGlobal, err)
}
}
for _, path := range *relabelConfigPaths {
if _, err := promrelabel.LoadRelabelConfigs(path); err != nil {
return fmt.Errorf("cannot load relabel configs from -remoteWrite.urlRelabelConfig=%q: %s", path, err)
}
}
return nil
}
var rwctxs []*remoteWriteCtx
// Init initializes remotewrite.
@@ -86,7 +101,7 @@ func Stop() {
// Push sends wr to remote storage systems set via `-remoteWrite.url`.
//
// Each timeseries in wr.Timeseries must contain one sample.
// Note that wr may be modified by Push due to relabeling.
func Push(wr *prompbmarshal.WriteRequest) {
var rctx *relabelCtx
if len(prcsGlobal) > 0 || len(labelsGlobal) > 0 {
@@ -128,6 +143,8 @@ type remoteWriteCtx struct {
pss []*pendingSeries
pssNextIdx uint64
tss []prompbmarshal.TimeSeries
relabelMetricsDropped *metrics.Counter
}
@@ -181,6 +198,11 @@ func (rwctx *remoteWriteCtx) MustStop() {
func (rwctx *remoteWriteCtx) Push(tss []prompbmarshal.TimeSeries) {
var rctx *relabelCtx
if len(rwctx.prcs) > 0 {
// Make a copy of tss before applying relabeling in order to prevent
// from affecting time series for other remoteWrite.url configs.
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/467 for details.
rwctx.tss = append(rwctx.tss[:0], tss...)
tss = rwctx.tss
rctx = getRelabelCtx()
tssLen := len(tss)
tss = rctx.applyRelabeling(tss, nil, rwctx.prcs)
@@ -191,5 +213,7 @@ func (rwctx *remoteWriteCtx) Push(tss []prompbmarshal.TimeSeries) {
pss[idx].Push(tss)
if rctx != nil {
putRelabelCtx(rctx)
// Zero rwctx.tss in order to free up GC references.
rwctx.tss = prompbmarshal.ResetTimeSeries(rwctx.tss)
}
}

View File

@@ -4,12 +4,17 @@ import (
"net"
"sync/atomic"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/netutil"
"github.com/VictoriaMetrics/fasthttp"
"github.com/VictoriaMetrics/metrics"
)
func statDial(addr string) (net.Conn, error) {
conn, err := fasthttp.Dial(addr)
func statDial(addr string) (conn net.Conn, err error) {
if netutil.TCP6Enabled() {
conn, err = fasthttp.DialDualStack(addr)
} else {
conn, err = fasthttp.Dial(addr)
}
dialsTotal.Inc()
if err != nil {
dialErrors.Inc()

View File

@@ -52,14 +52,16 @@ publish-vmalert:
APP_NAME=vmalert $(MAKE) publish-via-docker
test-vmalert:
go test -race -cover ./app/vmalert
go test -v -race -cover ./app/vmalert -loggerLevel=ERROR
go test -v -race -cover ./app/vmalert/datasource
go test -v -race -cover ./app/vmalert/notifier
run-vmalert: vmalert
./bin/vmalert -rule=app/vmalert/testdata/rules0-good.rules \
-datasource.url=http://localhost:8428 \
-notifier.url=http://localhost:9093 \
-remotewrite.url=http://localhost:8428 \
-remoteread.url=http://localhost:8428 \
-remoteWrite.url=http://localhost:8428 \
-remoteRead.url=http://localhost:8428 \
-evaluationInterval=3s
vmalert-amd64:

View File

@@ -13,7 +13,7 @@ sends alerts to [Alert Manager](https://github.com/prometheus/alertmanager).
* Lightweight without extra dependencies.
### TODO:
* Configuration hot reload.
* Support recording rules.
### QuickStart
@@ -48,10 +48,11 @@ Rules in group evaluated one-by-one sequentially.
* `http://<vmalert-addr>/api/v1/<groupName>/<alertID>/status" ` - get alert status by ID.
Used as alert source in AlertManager.
* `http://<vmalert-addr>/metrics` - application metrics.
* `http://<vmalert-addr>/-/reload` - hot configuration reload.
`vmalert` may be configured with `-remotewrite` flag to write alerts state in form of timeseries
`vmalert` may be configured with `-remoteWrite` flag to write alerts state in form of timeseries
via remote write protocol. Alerts state will be written as `ALERTS` timeseries. These timeseries
may be used to recover alerts state on `vmalert` restarts if `-remoteread` is configured.
may be used to recover alerts state on `vmalert` restarts if `-remoteRead` is configured.
### Configuration
@@ -81,19 +82,21 @@ Usage of vmalert:
Address to listen for http connections (default ":8880")
-notifier.url string
Prometheus alertmanager URL. Required parameter. e.g. http://127.0.0.1:9093
-remoteread.basicAuth.password string
Optional basic auth password for -remoteread.url
-remoteread.basicAuth.username string
Optional basic auth username for -remoteread.url
-remoteread.lookback duration
-remoteRead.basicAuth.password string
Optional basic auth password for -remoteRead.url
-remoteRead.basicAuth.username string
Optional basic auth username for -remoteRead.url
-remoteRead.lookback duration
Lookback defines how far to look into past for alerts timeseries. For example, if lookback=1h then range from now() to now()-1h will be scanned. (default 1h0m0s)
-remoteread.url vmalert
Optional URL to Victoria Metrics or VMSelect that will be used to restore alerts state. This configuration makes sense only if vmalert was configured with `remotewrite.url` before and has been successfully persisted its state. E.g. http://127.0.0.1:8428
-remotewrite.basicAuth.password string
Optional basic auth password for -remotewrite.url
-remotewrite.basicAuth.username string
Optional basic auth username for -remotewrite.url
-remotewrite.url string
-remoteRead.url vmalert
Optional URL to Victoria Metrics or VMSelect that will be used to restore alerts state. This configuration makes sense only if vmalert was configured with `remoteWrite.url` before and has been successfully persisted its state. E.g. http://127.0.0.1:8428
-remoteWrite.basicAuth.password string
Optional basic auth password for -remoteWrite.url
-remoteWrite.basicAuth.username string
Optional basic auth username for -remoteWrite.url
-remoteWrite.maxQueueSize
Defines the max number of pending datapoints to remote write endpoint
-remoteWrite.url string
Optional URL to Victoria Metrics or VMInsert where to persist alerts state in form of timeseries. E.g. http://127.0.0.1:8428
-rule value
Path to the file with alert rules.
@@ -109,8 +112,30 @@ Usage of vmalert:
Pass `-help` to `vmalert` in order to see the full list of supported
command-line flags with their descriptions.
To reload configuration without `vmalert` restart send SIGHUP signal
or send GET request to `/-/reload` endpoint.
### Contributing
`vmalert` is mostly designed and built by VictoriaMetrics community.
Feel free to share your experience and ideas for improving this
software. Please keep simplicity as the main priority.
### How to build from sources
It is recommended using
[binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases)
- `vmalert` is located in `vmutils-*` archives there.
#### Development build
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.13.
2. Run `make vmalert` from the root folder of the repository.
It builds `vmalert` binary and puts it into the `bin` folder.
#### Production build
1. [Install docker](https://docs.docker.com/install/).
2. Run `make vmalert-prod` from the root folder of the repository.
It builds `vmalert-prod` binary and puts it into the `bin` folder.

View File

@@ -27,29 +27,33 @@ func Parse(pathPatterns []string, validateAnnotations bool) ([]Group, error) {
if err != nil {
return nil, fmt.Errorf("file %s: %w", file, err)
}
for _, group := range gr {
if _, ok := groupsNames[group.Name]; ok {
return nil, fmt.Errorf("one file can not contain groups with the same name %s, filepath:%s", file, group.Name)
for _, g := range gr {
if _, ok := groupsNames[g.Name]; ok {
return nil, fmt.Errorf("one file can not contain groups with the same name %s, filepath:%s", g.Name, file)
}
groupsNames[group.Name] = struct{}{}
for _, rule := range group.Rules {
g.File = file
g.doneCh = make(chan struct{})
g.finishedCh = make(chan struct{})
g.updateCh = make(chan Group)
groupsNames[g.Name] = struct{}{}
for _, rule := range g.Rules {
if err = rule.Validate(); err != nil {
return nil, fmt.Errorf("invalid rule filepath:%s, group %s:%w", file, group.Name, err)
return nil, fmt.Errorf("invalid rule filepath: %s, group %s: %w", file, g.Name, err)
}
// TODO: this init looks weird here
rule.alerts = make(map[uint64]*notifier.Alert)
if validateAnnotations {
if err = notifier.ValidateTemplates(rule.Annotations); err != nil {
return nil, fmt.Errorf("invalid annotations filepath:%s, group %s:%w", file, group.Name, err)
return nil, fmt.Errorf("invalid annotations filepath: %s, group %s: %w", file, g.Name, err)
}
if err = notifier.ValidateTemplates(rule.Labels); err != nil {
return nil, fmt.Errorf("invalid labels filepath:%s, group %s:%w", file, group.Name, err)
return nil, fmt.Errorf("invalid labels filepath: %s, group %s: %w", file, g.Name, err)
}
}
rule.group = group
rule.group = g
rule.alerts = make(map[uint64]*notifier.Alert)
}
groups = append(groups, g)
}
groups = append(groups, gr...)
}
if len(groups) < 1 {
return nil, fmt.Errorf("no groups found in %s", strings.Join(pathPatterns, ";"))

197
app/vmalert/group.go Normal file
View File

@@ -0,0 +1,197 @@
package main
import (
"context"
"fmt"
"hash/fnv"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/metrics"
)
// Group is an entity for grouping rules
type Group struct {
Name string
File string
Rules []*Rule
doneCh chan struct{}
finishedCh chan struct{}
// channel accepts new Group obj
// which supposed to update current group
updateCh chan Group
}
// ID return unique group ID that consists of
// rules file and group name
func (g *Group) ID() uint64 {
hash := fnv.New64a()
hash.Write([]byte(g.File))
hash.Write([]byte("\xff"))
hash.Write([]byte(g.Name))
return hash.Sum64()
}
// Restore restores alerts state for all group rules with For > 0
func (g *Group) Restore(ctx context.Context, q datasource.Querier, lookback time.Duration) error {
for _, rule := range g.Rules {
if rule.For == 0 {
return nil
}
if err := rule.Restore(ctx, q, lookback); err != nil {
return fmt.Errorf("error while restoring rule %q: %s", rule.Name, err)
}
}
return nil
}
// updateWith updates existing group with
// passed group object.
// Not thread-safe.
func (g *Group) updateWith(newGroup Group) {
rulesRegistry := make(map[string]*Rule)
for _, nr := range newGroup.Rules {
rulesRegistry[nr.id()] = nr
}
for i, or := range g.Rules {
nr, ok := rulesRegistry[or.id()]
if !ok {
// old rule is not present in the new list
// so we mark it for removing
g.Rules[i] = nil
continue
}
// copy all significant fields.
// alerts state isn't copied since
// it should be updated in next 2 Execs
or.For = nr.For
or.Expr = nr.Expr
or.Labels = nr.Labels
or.Annotations = nr.Annotations
delete(rulesRegistry, nr.id())
}
var newRules []*Rule
for _, r := range g.Rules {
if r == nil {
// skip nil rules
continue
}
newRules = append(newRules, r)
}
// add the rest of rules from registry
for _, nr := range rulesRegistry {
newRules = append(newRules, nr)
}
g.Rules = newRules
}
var (
iterationTotal = metrics.NewCounter(`vmalert_iteration_total`)
iterationDuration = metrics.NewSummary(`vmalert_iteration_duration_seconds`)
execTotal = metrics.NewCounter(`vmalert_execution_total`)
execErrors = metrics.NewCounter(`vmalert_execution_errors_total`)
execDuration = metrics.NewSummary(`vmalert_execution_duration_seconds`)
alertsFired = metrics.NewCounter(`vmalert_alerts_fired_total`)
alertsSent = metrics.NewCounter(`vmalert_alerts_sent_total`)
alertsSendErrors = metrics.NewCounter(`vmalert_alerts_send_errors_total`)
remoteWriteSent = metrics.NewCounter(`vmalert_remotewrite_sent_total`)
remoteWriteErrors = metrics.NewCounter(`vmalert_remotewrite_errors_total`)
)
func (g *Group) close() {
if g.doneCh == nil {
return
}
close(g.doneCh)
<-g.finishedCh
}
func (g *Group) start(ctx context.Context, interval time.Duration,
querier datasource.Querier, nr notifier.Notifier, rw *remotewrite.Client) {
logger.Infof("group %q started", g.Name)
t := time.NewTicker(interval)
defer t.Stop()
for {
select {
case <-ctx.Done():
logger.Infof("group %q: context cancelled", g.Name)
close(g.finishedCh)
return
case <-g.doneCh:
logger.Infof("group %q: received stop signal", g.Name)
close(g.finishedCh)
return
case ng := <-g.updateCh:
g.updateWith(ng)
case <-t.C:
iterationTotal.Inc()
iterationStart := time.Now()
for _, rule := range g.Rules {
execTotal.Inc()
execStart := time.Now()
err := rule.Exec(ctx, querier)
execDuration.UpdateDuration(execStart)
if err != nil {
execErrors.Inc()
logger.Errorf("failed to execute rule %q.%q: %s", g.Name, rule.Name, err)
continue
}
var alertsToSend []notifier.Alert
for _, a := range rule.alerts {
switch a.State {
case notifier.StateFiring:
// set End to execStart + 3 intervals
// so notifier can resolve it automatically if `vmalert`
// won't be able to send resolve for some reason
a.End = execStart.Add(3 * interval)
alertsToSend = append(alertsToSend, *a)
pushToRW(rw, rule, a, execStart)
case notifier.StatePending:
pushToRW(rw, rule, a, execStart)
case notifier.StateInactive:
// set End to execStart to notify
// that it was just resolved
a.End = execStart
alertsToSend = append(alertsToSend, *a)
}
}
if len(alertsToSend) == 0 {
continue
}
alertsSent.Add(len(alertsToSend))
if err := nr.Send(ctx, alertsToSend); err != nil {
alertsSendErrors.Inc()
logger.Errorf("failed to send alert for rule %q.%q: %s", g.Name, rule.Name, err)
}
}
iterationDuration.UpdateDuration(iterationStart)
}
}
}
func pushToRW(rw *remotewrite.Client, rule *Rule, a *notifier.Alert, timestamp time.Time) {
if rw == nil {
return
}
tss := rule.AlertToTimeSeries(a, timestamp)
remoteWriteSent.Add(len(tss))
for _, ts := range tss {
if err := rw.Push(ts); err != nil {
remoteWriteErrors.Inc()
logger.Errorf("failed to push timeseries to remotewrite: %s", err)
}
}
}

217
app/vmalert/group_test.go Normal file
View File

@@ -0,0 +1,217 @@
package main
import (
"context"
"reflect"
"sort"
"sync"
"testing"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
)
func TestUpdateWith(t *testing.T) {
testCases := []struct {
name string
currentRules []*Rule
// rules must be sorted by name
newRules []*Rule
}{
{
"new rule",
[]*Rule{},
[]*Rule{{Name: "bar"}},
},
{
"update rule",
[]*Rule{{
Name: "foo",
Expr: "up > 0",
For: time.Second,
Labels: map[string]string{
"bar": "baz",
},
Annotations: map[string]string{
"summary": "{{ $value|humanize }}",
"description": "{{$labels}}",
},
}},
[]*Rule{{
Name: "bar",
Expr: "up > 10",
For: time.Second,
Labels: map[string]string{
"baz": "bar",
},
Annotations: map[string]string{
"summary": "none",
},
}},
},
{
"empty rule",
[]*Rule{{Name: "foo"}},
[]*Rule{},
},
{
"multiple rules",
[]*Rule{{Name: "bar"}, {Name: "baz"}, {Name: "foo"}},
[]*Rule{{Name: "baz"}, {Name: "foo"}},
},
{
"replace rule",
[]*Rule{{Name: "foo1"}},
[]*Rule{{Name: "foo2"}},
},
{
"replace multiple rules",
[]*Rule{{Name: "foo1"}, {Name: "foo2"}},
[]*Rule{{Name: "foo3"}, {Name: "foo4"}},
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
g := &Group{Rules: tc.currentRules}
g.updateWith(Group{Rules: tc.newRules})
if len(g.Rules) != len(tc.newRules) {
t.Fatalf("expected to have %d rules; got: %d",
len(g.Rules), len(tc.newRules))
}
sort.Slice(g.Rules, func(i, j int) bool {
return g.Rules[i].Name < g.Rules[j].Name
})
for i, r := range g.Rules {
got, want := r, tc.newRules[i]
if got.Name != want.Name {
t.Fatalf("expected to have rule %q; got %q", want.Name, got.Name)
}
if got.Expr != want.Expr {
t.Fatalf("expected to have expression %q; got %q", want.Expr, got.Expr)
}
if got.For != want.For {
t.Fatalf("expected to have for %q; got %q", want.For, got.For)
}
if !reflect.DeepEqual(got.Annotations, want.Annotations) {
t.Fatalf("expected to have annotations %#v; got %#v", want.Annotations, got.Annotations)
}
if !reflect.DeepEqual(got.Labels, want.Labels) {
t.Fatalf("expected to have labels %#v; got %#v", want.Labels, got.Labels)
}
}
})
}
}
func TestGroupStart(t *testing.T) {
// TODO: make parsing from string instead of file
groups, err := Parse([]string{"testdata/rules1-good.rules"}, true)
if err != nil {
t.Fatalf("failed to parse rules: %s", err)
}
g := groups[0]
fn := &fakeNotifier{}
fs := &fakeQuerier{}
const inst1, inst2, job = "foo", "bar", "baz"
m1 := metricWithLabels(t, "instance", inst1, "job", job)
m2 := metricWithLabels(t, "instance", inst2, "job", job)
r := g.Rules[0]
alert1, err := r.newAlert(m1)
if err != nil {
t.Fatalf("faield to create alert: %s", err)
}
alert1.State = notifier.StateFiring
alert1.ID = hash(m1)
alert2, err := r.newAlert(m2)
if err != nil {
t.Fatalf("faield to create alert: %s", err)
}
alert2.State = notifier.StateFiring
alert2.ID = hash(m2)
const evalInterval = time.Millisecond
finished := make(chan struct{})
fs.add(m1)
fs.add(m2)
go func() {
g.start(context.Background(), evalInterval, fs, fn, nil)
close(finished)
}()
// wait for multiple evals
time.Sleep(20 * evalInterval)
gotAlerts := fn.getAlerts()
expectedAlerts := []notifier.Alert{*alert1, *alert2}
compareAlerts(t, expectedAlerts, gotAlerts)
// reset previous data
fs.reset()
// and set only one datapoint for response
fs.add(m1)
// wait for multiple evals
time.Sleep(20 * evalInterval)
gotAlerts = fn.getAlerts()
expectedAlerts = []notifier.Alert{*alert1}
compareAlerts(t, expectedAlerts, gotAlerts)
g.close()
<-finished
}
func compareAlerts(t *testing.T, as, bs []notifier.Alert) {
t.Helper()
if len(as) != len(bs) {
t.Fatalf("expected to have length %d; got %d", len(as), len(bs))
}
sort.Slice(as, func(i, j int) bool {
return as[i].ID < as[j].ID
})
sort.Slice(bs, func(i, j int) bool {
return bs[i].ID < bs[j].ID
})
for i := range as {
a, b := as[i], bs[i]
if a.Name != b.Name {
t.Fatalf("expected t have Name %q; got %q", a.Name, b.Name)
}
if a.State != b.State {
t.Fatalf("expected t have State %q; got %q", a.State, b.State)
}
if a.Value != b.Value {
t.Fatalf("expected t have Value %f; got %f", a.Value, b.Value)
}
if !reflect.DeepEqual(a.Annotations, b.Annotations) {
t.Fatalf("expected to have annotations %#v; got %#v", a.Annotations, b.Annotations)
}
if !reflect.DeepEqual(a.Labels, b.Labels) {
t.Fatalf("expected to have labels %#v; got %#v", a.Labels, b.Labels)
}
}
}
type fakeNotifier struct {
sync.Mutex
alerts []notifier.Alert
}
func (fn *fakeNotifier) Send(_ context.Context, alerts []notifier.Alert) error {
fn.Lock()
defer fn.Unlock()
fn.alerts = alerts
return nil
}
func (fn *fakeNotifier) getAlerts() []notifier.Alert {
fn.Lock()
defer fn.Unlock()
return fn.alerts
}

View File

@@ -8,7 +8,6 @@ import (
"net/url"
"os"
"strings"
"sync"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
@@ -16,6 +15,7 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/envflag"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
@@ -38,17 +38,18 @@ absolute path to all .yaml files in root.`)
basicAuthUsername = flag.String("datasource.basicAuth.username", "", "Optional basic auth username for -datasource.url")
basicAuthPassword = flag.String("datasource.basicAuth.password", "", "Optional basic auth password for -datasource.url")
remoteWriteURL = flag.String("remotewrite.url", "", "Optional URL to Victoria Metrics or VMInsert where to persist alerts state"+
remoteWriteURL = flag.String("remoteWrite.url", "", "Optional URL to Victoria Metrics or VMInsert where to persist alerts state"+
" in form of timeseries. E.g. http://127.0.0.1:8428")
remoteWriteUsername = flag.String("remotewrite.basicAuth.username", "", "Optional basic auth username for -remotewrite.url")
remoteWritePassword = flag.String("remotewrite.basicAuth.password", "", "Optional basic auth password for -remotewrite.url")
remoteWriteUsername = flag.String("remoteWrite.basicAuth.username", "", "Optional basic auth username for -remoteWrite.url")
remoteWritePassword = flag.String("remoteWrite.basicAuth.password", "", "Optional basic auth password for -remoteWrite.url")
remoteWriteMaxQueueSize = flag.Int("remoteWrite.maxQueueSize", 10e3, "Defines the max number of pending datapoints to remote write endpoint")
remoteReadURL = flag.String("remoteread.url", "", "Optional URL to Victoria Metrics or VMSelect that will be used to restore alerts"+
" state. This configuration makes sense only if `vmalert` was configured with `remotewrite.url` before and has been successfully persisted its state."+
remoteReadURL = flag.String("remoteRead.url", "", "Optional URL to Victoria Metrics or VMSelect that will be used to restore alerts"+
" state. This configuration makes sense only if `vmalert` was configured with `remoteWrite.url` before and has been successfully persisted its state."+
" E.g. http://127.0.0.1:8428")
remoteReadUsername = flag.String("remoteread.basicAuth.username", "", "Optional basic auth username for -remoteread.url")
remoteReadPassword = flag.String("remoteread.basicAuth.password", "", "Optional basic auth password for -remoteread.url")
remoteReadLookBack = flag.Duration("remoteread.lookback", time.Hour, "Lookback defines how far to look into past for alerts timeseries."+
remoteReadUsername = flag.String("remoteRead.basicAuth.username", "", "Optional basic auth username for -remoteRead.url")
remoteReadPassword = flag.String("remoteRead.basicAuth.password", "", "Optional basic auth password for -remoteRead.url")
remoteReadLookBack = flag.Duration("remoteRead.lookback", time.Hour, "Lookback defines how far to look into past for alerts timeseries."+
" For example, if lookback=1h then range from now() to now()-1h will be scanned.")
evaluationInterval = flag.Duration("evaluationInterval", time.Minute, "How often to evaluate the rules. Default 1m")
@@ -56,8 +57,9 @@ absolute path to all .yaml files in root.`)
externalURL = flag.String("external.url", "", "External URL is used as alert's source for sent alerts to the notifier")
)
// TODO: hot configuration reload
func main() {
// Write flags and help message to stdout, since it is easier to grep or pipe.
flag.CommandLine.SetOutput(os.Stdout)
envflag.Parse()
buildinfo.Init()
logger.Init()
@@ -65,26 +67,21 @@ func main() {
ctx, cancel := context.WithCancel(context.Background())
eu, err := getExternalURL(*externalURL, *httpListenAddr, httpserver.IsTLS())
if err != nil {
logger.Fatalf("can not get external url:%s ", err)
logger.Fatalf("can not get external url: %s ", err)
}
notifier.InitTemplateFunc(eu)
logger.Infof("reading alert rules configuration file from %s", strings.Join(*rulePath, ";"))
groups, err := Parse(*rulePath, *validateTemplates)
if err != nil {
logger.Fatalf("cannot parse configuration file: %s", err)
}
w := &watchdog{
manager := &manager{
groups: make(map[uint64]*Group),
storage: datasource.NewVMStorage(*datasourceURL, *basicAuthUsername, *basicAuthPassword, &http.Client{}),
alertProvider: notifier.NewAlertManager(*notifierURL, func(group, name string) string {
return fmt.Sprintf("%s/api/v1/%s/%s/status", eu, group, name)
notifier: notifier.NewAlertManager(*notifierURL, func(group, alert string) string {
return fmt.Sprintf("%s/api/v1/%s/%s/status", eu, group, alert)
}, &http.Client{}),
}
if *remoteWriteURL != "" {
c, err := remotewrite.NewClient(ctx, remotewrite.Config{
Addr: *remoteWriteURL,
MaxQueueSize: *remoteWriteMaxQueueSize,
FlushInterval: *evaluationInterval,
BasicAuthUser: *remoteWriteUsername,
BasicAuthPass: *remoteWritePassword,
@@ -92,30 +89,39 @@ func main() {
if err != nil {
logger.Fatalf("failed to init remotewrite client: %s", err)
}
w.rw = c
manager.rw = c
}
var restoreDS *datasource.VMStorage
if *remoteReadURL != "" {
restoreDS = datasource.NewVMStorage(*remoteReadURL, *remoteReadUsername, *remoteReadPassword, &http.Client{})
manager.rr = datasource.NewVMStorage(*remoteReadURL, *remoteReadUsername, *remoteReadPassword, &http.Client{})
}
wg := sync.WaitGroup{}
for _, g := range groups {
if restoreDS != nil {
err := g.Restore(ctx, restoreDS, *remoteReadLookBack)
if err != nil {
logger.Errorf("error while restoring state for group %q: %s", g.Name, err)
if err := manager.start(ctx, *rulePath, *validateTemplates); err != nil {
logger.Fatalf("failed to start: %s", err)
}
go func() {
// init reload metrics with positive values to improve alerting conditions
configSuccess.Set(1)
configTimestamp.Set(fasttime.UnixTimestamp())
sigHup := procutil.NewSighupChan()
for {
<-sigHup
configReloads.Inc()
logger.Infof("SIGHUP received. Going to reload rules %q ...", *rulePath)
if err := manager.update(ctx, *rulePath, *validateTemplates, false); err != nil {
configReloadErrors.Inc()
configSuccess.Set(0)
logger.Errorf("error while reloading rules: %s", err)
continue
}
configSuccess.Set(1)
configTimestamp.Set(fasttime.UnixTimestamp())
logger.Infof("Rules reloaded successfully from %q", *rulePath)
}
wg.Add(1)
go func(group Group) {
w.run(ctx, group, *evaluationInterval)
wg.Done()
}(g)
}
}()
go httpserver.Serve(*httpListenAddr, (&requestHandler{groups: groups}).handler)
rh := &requestHandler{m: manager}
go httpserver.Serve(*httpListenAddr, rh.handler)
sig := procutil.WaitForSigterm()
logger.Infof("service received signal %s", sig)
@@ -123,91 +129,16 @@ func main() {
logger.Fatalf("cannot stop the webservice: %s", err)
}
cancel()
if w.rw != nil {
err := w.rw.Close()
if err != nil {
logger.Fatalf("cannot stop the remotewrite: %s", err)
}
}
wg.Wait()
}
type watchdog struct {
storage *datasource.VMStorage
alertProvider notifier.Notifier
rw *remotewrite.Client
manager.close()
}
var (
iterationTotal = metrics.NewCounter(`vmalert_iteration_total`)
iterationDuration = metrics.NewSummary(`vmalert_iteration_duration_seconds`)
execTotal = metrics.NewCounter(`vmalert_execution_total`)
execErrors = metrics.NewCounter(`vmalert_execution_errors_total`)
execDuration = metrics.NewSummary(`vmalert_execution_duration_seconds`)
alertsFired = metrics.NewCounter(`vmalert_alerts_fired_total`)
alertsSent = metrics.NewCounter(`vmalert_alerts_sent_total`)
alertsSendErrors = metrics.NewCounter(`vmalert_alerts_send_errors_total`)
remoteWriteSent = metrics.NewCounter(`vmalert_remotewrite_sent_total`)
remoteWriteErrors = metrics.NewCounter(`vmalert_remotewrite_errors_total`)
configReloads = metrics.NewCounter(`vmalert_config_last_reload_total`)
configReloadErrors = metrics.NewCounter(`vmalert_config_last_reload_errors_total`)
configSuccess = metrics.NewCounter(`vmalert_config_last_reload_successful`)
configTimestamp = metrics.NewCounter(`vmalert_config_last_reload_success_timestamp_seconds`)
)
func (w *watchdog) run(ctx context.Context, group Group, evaluationInterval time.Duration) {
logger.Infof("watchdog for %s has been started", group.Name)
t := time.NewTicker(evaluationInterval)
defer t.Stop()
for {
select {
case <-t.C:
iterationTotal.Inc()
iterationStart := time.Now()
for _, rule := range group.Rules {
execTotal.Inc()
execStart := time.Now()
err := rule.Exec(ctx, w.storage)
execDuration.UpdateDuration(execStart)
if err != nil {
execErrors.Inc()
logger.Errorf("failed to execute rule %q.%q: %s", group.Name, rule.Name, err)
continue
}
var alertsToSend []notifier.Alert
for _, a := range rule.alerts {
if a.State != notifier.StatePending {
alertsToSend = append(alertsToSend, *a)
}
if a.State == notifier.StateInactive || w.rw == nil {
continue
}
tss := rule.AlertToTimeSeries(a, execStart)
for _, ts := range tss {
remoteWriteSent.Inc()
if err := w.rw.Push(ts); err != nil {
remoteWriteErrors.Inc()
logger.Errorf("failed to push timeseries to remotewrite: %s", err)
}
}
}
alertsSent.Add(len(alertsToSend))
if err := w.alertProvider.Send(alertsToSend); err != nil {
alertsSendErrors.Inc()
logger.Errorf("failed to send alert for rule %q.%q: %s", group.Name, rule.Name, err)
}
}
iterationDuration.UpdateDuration(iterationStart)
case <-ctx.Done():
logger.Infof("%s received stop signal", group.Name)
return
}
}
}
func getExternalURL(externalURL, httpListenAddr string, isSecure bool) (*url.URL, error) {
if externalURL != "" {
return url.Parse(externalURL)

108
app/vmalert/manager.go Normal file
View File

@@ -0,0 +1,108 @@
package main
import (
"context"
"fmt"
"strings"
"sync"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
)
type manager struct {
storage datasource.Querier
notifier notifier.Notifier
rw *remotewrite.Client
rr datasource.Querier
wg sync.WaitGroup
groupsMu sync.RWMutex
groups map[uint64]*Group
}
// AlertAPI generates APIAlert object from alert by its id(hash)
func (m *manager) AlertAPI(gID, aID uint64) (*APIAlert, error) {
m.groupsMu.RLock()
defer m.groupsMu.RUnlock()
g, ok := m.groups[gID]
if !ok {
return nil, fmt.Errorf("can't find group with id %q", gID)
}
for _, rule := range g.Rules {
if apiAlert := rule.AlertAPI(aID); apiAlert != nil {
return apiAlert, nil
}
}
return nil, fmt.Errorf("can't func alert with id %q in group %q", aID, g.Name)
}
func (m *manager) start(ctx context.Context, path []string, validate bool) error {
return m.update(ctx, path, validate, true)
}
func (m *manager) close() {
if m.rw != nil {
err := m.rw.Close()
if err != nil {
logger.Fatalf("cannot stop the remotewrite: %s", err)
}
}
m.wg.Wait()
}
func (m *manager) startGroup(ctx context.Context, group Group, restore bool) {
if restore && m.rr != nil {
err := group.Restore(ctx, m.rr, *remoteReadLookBack)
if err != nil {
logger.Errorf("error while restoring state for group %q: %s", group.Name, err)
}
}
m.wg.Add(1)
id := group.ID()
go func() {
group.start(ctx, *evaluationInterval, m.storage, m.notifier, m.rw)
m.wg.Done()
}()
m.groups[id] = &group
}
func (m *manager) update(ctx context.Context, path []string, validate, restore bool) error {
logger.Infof("reading alert rules configuration file from %q", strings.Join(path, ";"))
newGroups, err := Parse(path, validate)
if err != nil {
return fmt.Errorf("cannot parse configuration file: %s", err)
}
groupsRegistry := make(map[uint64]Group)
for _, ng := range newGroups {
groupsRegistry[ng.ID()] = ng
}
m.groupsMu.Lock()
for _, og := range m.groups {
ng, ok := groupsRegistry[og.ID()]
if !ok {
// old group is not present in new list
// and must be stopped and deleted
og.close()
delete(m.groups, og.ID())
og = nil
continue
}
og.updateCh <- ng
delete(groupsRegistry, ng.ID())
}
for _, ng := range groupsRegistry {
m.startGroup(ctx, ng, restore)
}
m.groupsMu.Unlock()
return nil
}

163
app/vmalert/manager_test.go Normal file
View File

@@ -0,0 +1,163 @@
package main
import (
"context"
"math/rand"
"strings"
"sync"
"testing"
"time"
)
func TestManagerUpdateError(t *testing.T) {
m := &manager{groups: make(map[uint64]*Group)}
path := []string{"foo/bar"}
err := m.update(context.Background(), path, true, false)
if err == nil {
t.Fatalf("expected to have err; got nil instead")
}
expErr := "no groups found"
if !strings.Contains(err.Error(), expErr) {
t.Fatalf("expected to got err %s; got %s", expErr, err)
}
}
// TestManagerUpdateConcurrent supposed to test concurrent
// execution of configuration update.
// Should be executed with -race flag
func TestManagerUpdateConcurrent(t *testing.T) {
m := &manager{
groups: make(map[uint64]*Group),
storage: &fakeQuerier{},
notifier: &fakeNotifier{},
}
paths := []string{
"testdata/dir/rules0-good.rules",
"testdata/dir/rules1-good.rules",
"testdata/rules0-good.rules",
}
*evaluationInterval = time.Millisecond
if err := m.start(context.Background(), []string{paths[0]}, true); err != nil {
t.Fatalf("failed to start: %s", err)
}
const workers = 500
const iterations = 10
wg := sync.WaitGroup{}
wg.Add(workers)
for i := 0; i < workers; i++ {
go func() {
defer wg.Done()
for i := 0; i < iterations; i++ {
rnd := rand.Intn(len(paths))
path := []string{paths[rnd]}
err := m.update(context.Background(), path, true, false)
if err != nil {
t.Errorf("update error: %s", err)
}
}
}()
}
wg.Wait()
}
// TestManagerUpdate tests sequential configuration
// updates.
func TestManagerUpdate(t *testing.T) {
testCases := []struct {
name string
initPath string
updatePath string
want []*Group
}{
{
name: "update good rules",
initPath: "testdata/rules0-good.rules",
updatePath: "testdata/dir/rules1-good.rules",
want: []*Group{
{
File: "testdata/dir/rules1-good.rules",
Name: "duplicatedGroupDiffFiles",
Rules: []*Rule{newTestRule("VMRows", time.Second*10)},
},
},
},
{
name: "update good rules from 1 to 2 groups",
initPath: "testdata/dir/rules1-good.rules",
updatePath: "testdata/rules0-good.rules",
want: []*Group{
{
File: "testdata/rules0-good.rules",
Name: "groupGorSingleAlert", Rules: []*Rule{
newTestRule("VMRows", time.Second*10),
}},
{
File: "testdata/rules0-good.rules",
Name: "TestGroup", Rules: []*Rule{
newTestRule("Conns", time.Duration(0)),
newTestRule("ExampleAlertAlwaysFiring", time.Duration(0)),
}},
},
},
{
name: "update with one bad rule file",
initPath: "testdata/rules0-good.rules",
updatePath: "testdata/dir/rules2-bad.rules",
want: []*Group{
{
File: "testdata/rules0-good.rules",
Name: "groupGorSingleAlert", Rules: []*Rule{
newTestRule("VMRows", time.Second*10),
}},
{
File: "testdata/rules0-good.rules",
Name: "TestGroup", Rules: []*Rule{
newTestRule("Conns", time.Duration(0)),
newTestRule("ExampleAlertAlwaysFiring", time.Duration(0)),
}},
},
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
ctx, cancel := context.WithCancel(context.TODO())
m := &manager{groups: make(map[uint64]*Group), storage: &fakeQuerier{}}
path := []string{tc.initPath}
if err := m.update(ctx, path, true, false); err != nil {
t.Fatalf("failed to complete initial rules update: %s", err)
}
path = []string{tc.updatePath}
_ = m.update(ctx, path, true, false)
if len(tc.want) != len(m.groups) {
t.Fatalf("\nwant number of groups: %d;\ngot: %d ", len(tc.want), len(m.groups))
}
for _, wantG := range tc.want {
gotG, ok := m.groups[wantG.ID()]
if !ok {
t.Fatalf("expected to have group %q", wantG.Name)
}
compareGroups(t, gotG, wantG)
}
cancel()
m.close()
})
}
}
func compareGroups(t *testing.T, a, b *Group) {
t.Helper()
if len(a.Rules) != len(b.Rules) {
t.Fatalf("expected group %s to have %d rules; got: %d",
a.Name, len(a.Rules), len(b.Rules))
}
for i, r := range a.Rules {
got, want := r, b.Rules[i]
if got.Name != want.Name {
t.Fatalf("expected to have rule %q; got %q", want.Name, got.Name)
}
}
}

View File

@@ -12,12 +12,13 @@ import (
// Alert the triggered alert
// TODO: Looks like alert name isn't unique
type Alert struct {
Group string
GroupID uint64
Name string
Labels map[string]string
Annotations map[string]string
State AlertState
Expr string
Start time.Time
End time.Time
Value float64
@@ -52,14 +53,15 @@ func (as AlertState) String() string {
type alertTplData struct {
Labels map[string]string
Value float64
Expr string
}
const tplHeader = `{{ $value := .Value }}{{ $labels := .Labels }}`
const tplHeader = `{{ $value := .Value }}{{ $labels := .Labels }}{{ $expr := .Expr }}`
// ExecTemplate executes the Alert template for give
// map of annotations.
func (a *Alert) ExecTemplate(annotations map[string]string) (map[string]string, error) {
tplData := alertTplData{Value: a.Value, Labels: a.Labels}
tplData := alertTplData{Value: a.Value, Labels: a.Labels, Expr: a.Expr}
return templateAnnotations(annotations, tplHeader, tplData)
}

View File

@@ -1,22 +1,27 @@
package notifier
import (
"fmt"
"net/url"
"testing"
)
func TestAlert_ExecTemplate(t *testing.T) {
u, _ := url.Parse("https://victoriametrics.com/path")
InitTemplateFunc(u)
testCases := []struct {
name string
alert *Alert
annotations map[string]string
expTpl map[string]string
}{
{
name: "empty-alert",
alert: &Alert{},
annotations: map[string]string{},
expTpl: map[string]string{},
},
{
name: "no-template",
alert: &Alert{
Value: 1e4,
Labels: map[string]string{
@@ -27,6 +32,7 @@ func TestAlert_ExecTemplate(t *testing.T) {
expTpl: map[string]string{},
},
{
name: "label-template",
alert: &Alert{
Value: 1e4,
Labels: map[string]string{
@@ -43,10 +49,24 @@ func TestAlert_ExecTemplate(t *testing.T) {
"description": "It is 10000 connections for localhost",
},
},
{
name: "expression-template",
alert: &Alert{
Expr: `vm_rows{"label"="bar"}>0`,
},
annotations: map[string]string{
"exprEscapedQuery": "{{ $expr|quotesEscape|queryEscape }}",
"exprEscapedPath": "{{ $expr|quotesEscape|pathEscape }}",
},
expTpl: map[string]string{
"exprEscapedQuery": "vm_rows%7B%5C%22label%5C%22%3D%5C%22bar%5C%22%7D%3E0",
"exprEscapedPath": "vm_rows%7B%5C%22label%5C%22=%5C%22bar%5C%22%7D%3E0",
},
},
}
for i, tc := range testCases {
t.Run(fmt.Sprintf("%d", i), func(t *testing.T) {
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
tpl, err := tc.alert.ExecTemplate(tc.annotations)
if err != nil {
t.Fatal(err)

View File

@@ -2,6 +2,7 @@ package notifier
import (
"bytes"
"context"
"fmt"
"io/ioutil"
"net/http"
@@ -17,13 +18,21 @@ type AlertManager struct {
}
// Send an alert or resolve message
func (am *AlertManager) Send(alerts []Alert) error {
func (am *AlertManager) Send(ctx context.Context, alerts []Alert) error {
b := &bytes.Buffer{}
writeamRequest(b, alerts, am.argFunc)
resp, err := am.client.Post(am.alertURL, "application/json", b)
req, err := http.NewRequest("POST", am.alertURL, b)
if err != nil {
return err
}
req.Header.Set("Content-Type", "application/json")
req = req.WithContext(ctx)
resp, err := am.client.Do(req)
if err != nil {
return err
}
defer func() { _ = resp.Body.Close() }()
if resp.StatusCode != http.StatusOK {
@@ -37,7 +46,7 @@ func (am *AlertManager) Send(alerts []Alert) error {
}
// AlertURLGenerator returns URL to single alert by given name
type AlertURLGenerator func(group, id string) string
type AlertURLGenerator func(group, alert string) string
const alertManagerPath = "/api/v2/alerts"

View File

@@ -9,7 +9,7 @@
{% for i, alert := range alerts %}
{
"startsAt":{%q= alert.Start.Format(time.RFC3339Nano) %},
"generatorURL": {%q= generatorURL(alert.Group, strconv.FormatUint(alert.ID, 10)) %},
"generatorURL": {%q= generatorURL(strconv.FormatUint(alert.GroupID, 10), strconv.FormatUint(alert.ID, 10)) %},
{% if !alert.End.IsZero() %}
"endsAt":{%q= alert.End.Format(time.RFC3339Nano) %},
{% endif %}

View File

@@ -36,7 +36,7 @@ func streamamRequest(qw422016 *qt422016.Writer, alerts []Alert, generatorURL fun
//line app/vmalert/notifier/alertmanager_request.qtpl:11
qw422016.N().S(`,"generatorURL":`)
//line app/vmalert/notifier/alertmanager_request.qtpl:12
qw422016.N().Q(generatorURL(alert.Group, strconv.FormatUint(alert.ID, 10)))
qw422016.N().Q(generatorURL(strconv.FormatUint(alert.GroupID, 10), strconv.FormatUint(alert.ID, 10)))
//line app/vmalert/notifier/alertmanager_request.qtpl:12
qw422016.N().S(`,`)
//line app/vmalert/notifier/alertmanager_request.qtpl:13

View File

@@ -1,6 +1,7 @@
package notifier
import (
"context"
"encoding/json"
"net/http"
"net/http/httptest"
@@ -40,8 +41,8 @@ func TestAlertManager_Send(t *testing.T) {
if len(a) != 1 {
t.Errorf("expected 1 alert in array got %d", len(a))
}
if a[0].GeneratorURL != "group0" {
t.Errorf("exptected alert0 as generatorURL got %s", a[0].GeneratorURL)
if a[0].GeneratorURL != "0/0" {
t.Errorf("exptected 0/0 as generatorURL got %s", a[0].GeneratorURL)
}
if a[0].Labels["alertname"] != "alert0" {
t.Errorf("exptected alert0 as alert name got %s", a[0].Labels["alertname"])
@@ -57,16 +58,16 @@ func TestAlertManager_Send(t *testing.T) {
srv := httptest.NewServer(mux)
defer srv.Close()
am := NewAlertManager(srv.URL, func(group, name string) string {
return group + name
return group + "/" + name
}, srv.Client())
if err := am.Send([]Alert{{}, {}}); err == nil {
if err := am.Send(context.Background(), []Alert{{}, {}}); err == nil {
t.Error("expected connection error got nil")
}
if err := am.Send([]Alert{}); err == nil {
if err := am.Send(context.Background(), []Alert{}); err == nil {
t.Error("expected wrong http code error got nil")
}
if err := am.Send([]Alert{{
Group: "group",
if err := am.Send(context.Background(), []Alert{{
GroupID: 0,
Name: "alert0",
Start: time.Now().UTC(),
End: time.Now().UTC(),

View File

@@ -1,6 +1,8 @@
package notifier
import "context"
// Notifier is common interface for alert manager provider
type Notifier interface {
Send(alerts []Alert) error
Send(ctx context.Context, alerts []Alert) error
}

View File

@@ -142,6 +142,15 @@ func InitTemplateFunc(externalURL *url.URL) {
"externalURL": func() string {
return externalURL.String()
},
"pathEscape": func(u string) string {
return url.PathEscape(u)
},
"queryEscape": func(q string) string {
return url.QueryEscape(q)
},
"quotesEscape": func(q string) string {
return strings.Replace(q, `"`, `\"`, -1)
},
}
}

View File

@@ -103,7 +103,8 @@ func (c *Client) Push(s prompbmarshal.TimeSeries) error {
case c.input <- s:
return nil
default:
return fmt.Errorf("failed to push timeseries - queue is full (%d entries)",
return fmt.Errorf("failed to push timeseries - queue is full (%d entries). "+
"Queue size is controlled by -remoteWrite.maxQueueSize flag",
c.maxQueueSize)
}
}

View File

@@ -17,25 +17,6 @@ import (
"github.com/VictoriaMetrics/metricsql"
)
// Group grouping array of alert
type Group struct {
Name string
Rules []*Rule
}
// Restore restores alerts state for all group rules with For > 0
func (g *Group) Restore(ctx context.Context, q datasource.Querier, lookback time.Duration) error {
for _, rule := range g.Rules {
if rule.For == 0 {
return nil
}
if err := rule.Restore(ctx, q, lookback); err != nil {
return fmt.Errorf("error while restoring rule %q: %s", rule.Name, err)
}
}
return nil
}
// Rule is basic alert entity
type Rule struct {
Name string `yaml:"alert"`
@@ -58,6 +39,10 @@ type Rule struct {
lastExecError error
}
func (r *Rule) id() string {
return r.Name
}
// Validate validates rule
func (r *Rule) Validate() error {
if r.Name == "" {
@@ -86,7 +71,7 @@ func (r *Rule) Exec(ctx context.Context, q datasource.Querier) error {
}
for h, a := range r.alerts {
// cleanup inactive alerts from previous Eval
// cleanup inactive alerts from previous Exec
if a.State == notifier.StateInactive {
delete(r.alerts, h)
}
@@ -98,8 +83,16 @@ func (r *Rule) Exec(ctx context.Context, q datasource.Querier) error {
h := hash(m)
updated[h] = struct{}{}
if a, ok := r.alerts[h]; ok {
// update Value field with latest value
a.Value = m.Value
if a.Value != m.Value {
// update Value field with latest value
a.Value = m.Value
// and re-exec template since Value can be used
// in templates
err = r.template(a)
if err != nil {
return err
}
}
continue
}
a, err := r.newAlert(m)
@@ -116,19 +109,19 @@ func (r *Rule) Exec(ctx context.Context, q datasource.Querier) error {
// if alert wasn't updated in this iteration
// means it is resolved already
if _, ok := updated[h]; !ok {
if a.State == notifier.StatePending {
// alert was in Pending state - it is not
// active anymore
delete(r.alerts, h)
continue
}
a.State = notifier.StateInactive
// set endTime to last execution time
// so it can be sent by notifier on next step
a.End = r.lastExecTime
continue
}
if a.State == notifier.StatePending && time.Since(a.Start) >= r.For {
a.State = notifier.StateFiring
alertsFired.Inc()
}
if a.State == notifier.StateFiring {
a.End = r.lastExecTime.Add(3 * *evaluationInterval)
}
}
return nil
}
@@ -154,15 +147,14 @@ func hash(m datasource.Metric) uint64 {
func (r *Rule) newAlert(m datasource.Metric) (*notifier.Alert, error) {
a := &notifier.Alert{
Group: r.group.Name,
Name: r.Name,
Labels: map[string]string{},
Value: m.Value,
Start: time.Now(),
GroupID: r.group.ID(),
Name: r.Name,
Labels: map[string]string{},
Value: m.Value,
Start: time.Now(),
Expr: r.Expr,
// TODO: support End time
}
// 1. use data labels
for _, l := range m.Labels {
// drop __name__ to be consistent with Prometheus alerting
if l.Name == "__name__" {
@@ -170,28 +162,31 @@ func (r *Rule) newAlert(m datasource.Metric) (*notifier.Alert, error) {
}
a.Labels[l.Name] = l.Value
}
return a, r.template(a)
}
// 2. template rule labels with data labels
func (r *Rule) template(a *notifier.Alert) error {
// 1. template rule labels with data labels
rLabels, err := a.ExecTemplate(r.Labels)
if err != nil {
return a, err
return err
}
// 3. merge data labels and rule labels
// 2. merge data labels and rule labels
// metric labels may be overridden by
// rule labels
for k, v := range rLabels {
a.Labels[k] = v
}
// 4. template merged labels
// 3. template merged labels
a.Labels, err = a.ExecTemplate(a.Labels)
if err != nil {
return a, err
return err
}
a.Annotations, err = a.ExecTemplate(r.Annotations)
return a, err
return err
}
// AlertAPI generates APIAlert object from alert by its id(hash)
@@ -218,10 +213,11 @@ func (r *Rule) AlertsAPI() []*APIAlert {
func (r *Rule) newAlertAPI(a notifier.Alert) *APIAlert {
return &APIAlert{
// encode as string to avoid rounding
ID: fmt.Sprintf("%d", a.ID),
// encode as strings to avoid rounding
ID: fmt.Sprintf("%d", a.ID),
GroupID: fmt.Sprintf("%d", a.GroupID),
Name: a.Name,
Group: a.Group,
Expression: r.Expr,
Labels: a.Labels,
Annotations: a.Annotations,
@@ -298,8 +294,12 @@ func newTimeSeries(value float64, labels map[string]string, timestamp time.Time)
// Restore restores the state of active alerts basing on previously written timeseries.
// Restore restores only Start field. Field State will be always Pending and supposed
// to be updated on next Eval, as well as Value field.
// to be updated on next Exec, as well as Value field.
func (r *Rule) Restore(ctx context.Context, q datasource.Querier, lookback time.Duration) error {
if q == nil {
return fmt.Errorf("querier is nil")
}
// Get the last datapoint in range via MetricsQL `last_over_time`.
// We don't use plain PromQL since Prometheus doesn't support
// remote write protocol which is used for state persistence in vmalert.
@@ -314,7 +314,7 @@ func (r *Rule) Restore(ctx context.Context, q datasource.Querier, lookback time.
labels := m.Labels
m.Labels = make([]datasource.Label, 0)
// drop all extra labels, so hash key will
// be identical to timeseries received in Eval
// be identical to timeseries received in Exec
for _, l := range labels {
if l.Name == alertNameLabel {
continue
@@ -334,7 +334,7 @@ func (r *Rule) Restore(ctx context.Context, q datasource.Querier, lookback time.
a.State = notifier.StatePending
a.Start = time.Unix(int64(m.Value), 0)
r.alerts[a.ID] = a
logger.Infof("alert %q.%q restored to state at %v", a.Group, a.Name, a.Start)
logger.Infof("alert %q(%d) restored to state at %v", a.Name, a.ID, a.Start)
}
return nil
}

View File

@@ -2,6 +2,7 @@ package main
import (
"context"
"sync"
"testing"
"time"
@@ -295,16 +296,14 @@ func TestRule_Exec(t *testing.T) {
},
},
{
newTestRule("for-pending=>inactive", time.Millisecond),
newTestRule("for-pending=>empty", time.Second),
[][]datasource.Metric{
{metricWithLabels(t, "name", "foo")},
{metricWithLabels(t, "name", "foo")},
// empty step to reset pending alerts
// empty step to reset and delete pending alerts
{},
},
map[uint64]*notifier.Alert{
hash(metricWithLabels(t, "name", "foo")): {State: notifier.StateInactive},
},
map[uint64]*notifier.Alert{},
},
{
newTestRule("for-pending=>firing=>inactive", time.Millisecond),
@@ -392,19 +391,28 @@ func metricWithLabels(t *testing.T, labels ...string) datasource.Metric {
}
type fakeQuerier struct {
sync.Mutex
metrics []datasource.Metric
}
func (fq *fakeQuerier) reset() {
fq.Lock()
fq.metrics = fq.metrics[:0]
fq.Unlock()
}
func (fq *fakeQuerier) add(metrics ...datasource.Metric) {
fq.Lock()
fq.metrics = append(fq.metrics, metrics...)
fq.Unlock()
}
func (fq fakeQuerier) Query(ctx context.Context, query string) ([]datasource.Metric, error) {
return fq.metrics, nil
func (fq *fakeQuerier) Query(_ context.Context, _ string) ([]datasource.Metric, error) {
fq.Lock()
cpy := make([]datasource.Metric, len(fq.metrics))
copy(cpy, fq.metrics)
fq.Unlock()
return cpy, nil
}
func TestRule_Restore(t *testing.T) {

View File

@@ -6,6 +6,7 @@ groups:
expr: vm_rows > 0
labels:
label: bar
expr: "{{ $expr|queryEscape }}"
annotations:
summary: "{{ $value|humanize }}"
description: "{{$labels}}"

11
app/vmalert/testdata/rules1-good.rules vendored Normal file
View File

@@ -0,0 +1,11 @@
groups:
- name: groupTest
rules:
- alert: VMRows
for: 1ms
expr: vm_rows > 0
labels:
label: bar
host: "{{ $labels.instance }}"
annotations:
summary: "{{ $value }}"

View File

@@ -10,13 +10,16 @@ import (
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
)
// APIAlert has info for an alert.
// APIAlert represents an notifier.Alert state
// for WEB view
type APIAlert struct {
ID string `json:"id"`
Name string `json:"name"`
Group string `json:"group"`
GroupID string `json:"group_id"`
Expression string `json:"expression"`
State string `json:"state"`
Value string `json:"value"`
@@ -26,14 +29,15 @@ type APIAlert struct {
}
type requestHandler struct {
groups []Group
m *manager
}
var pathList = [][]string{
{"/api/v1/alerts", "list all active alerts"},
{"/api/v1/groupName/alertID/status", "get alert status by ID"},
{"/api/v1/groupID/alertID/status", "get alert status by ID"},
// /metrics is served by httpserver by default
{"/metrics", "list of application metrics"},
{"/-/reload", "reload configuration"},
}
func (rh *requestHandler) handler(w http.ResponseWriter, r *http.Request) bool {
@@ -48,6 +52,11 @@ func (rh *requestHandler) handler(w http.ResponseWriter, r *http.Request) bool {
case "/api/v1/alerts":
resph.handle(rh.list())
return true
case "/-/reload":
logger.Infof("api config reload was called, sending sighup")
procutil.SelfSIGHUP()
w.WriteHeader(http.StatusOK)
return true
default:
// /api/v1/<groupName>/<alertID>/status
if strings.HasSuffix(r.URL.Path, "/status") {
@@ -66,8 +75,10 @@ type listAlertsResponse struct {
}
func (rh *requestHandler) list() ([]byte, error) {
rh.m.groupsMu.RLock()
defer rh.m.groupsMu.RUnlock()
lr := listAlertsResponse{Status: "success"}
for _, g := range rh.groups {
for _, g := range rh.m.groups {
for _, r := range g.Rules {
lr.Data.Alerts = append(lr.Data.Alerts, r.AlertsAPI()...)
}
@@ -89,6 +100,9 @@ func (rh *requestHandler) list() ([]byte, error) {
}
func (rh *requestHandler) alert(path string) ([]byte, error) {
rh.m.groupsMu.RLock()
defer rh.m.groupsMu.RUnlock()
parts := strings.SplitN(strings.TrimPrefix(path, "/api/v1/"), "/", 3)
if len(parts) != 3 {
return nil, &httpserver.ErrorWithStatusCode{
@@ -96,29 +110,20 @@ func (rh *requestHandler) alert(path string) ([]byte, error) {
StatusCode: http.StatusBadRequest,
}
}
group := strings.TrimRight(parts[0], "/")
idStr := strings.TrimRight(parts[1], "/")
id, err := strconv.ParseUint(idStr, 10, 0)
groupID, err := uint64FromPath(parts[0])
if err != nil {
return nil, &httpserver.ErrorWithStatusCode{
Err: fmt.Errorf(`cannot parse int from %q`, idStr),
StatusCode: http.StatusBadRequest,
}
return nil, badRequest(fmt.Errorf(`cannot parse groupID: %s`, err))
}
for _, g := range rh.groups {
if g.Name != group {
continue
}
for _, rule := range g.Rules {
if apiAlert := rule.AlertAPI(id); apiAlert != nil {
return json.Marshal(apiAlert)
}
}
alertID, err := uint64FromPath(parts[1])
if err != nil {
return nil, badRequest(fmt.Errorf(`cannot parse alertID: %s`, err))
}
return nil, &httpserver.ErrorWithStatusCode{
Err: fmt.Errorf(`cannot find alert %s in %q`, idStr, group),
StatusCode: http.StatusNotFound,
resp, err := rh.m.AlertAPI(groupID, alertID)
if err != nil {
return nil, errResponse(err, http.StatusNotFound)
}
return json.Marshal(resp)
}
// responseHandler wrapper on http.ResponseWriter with sugar
@@ -132,3 +137,19 @@ func (w responseHandler) handle(b []byte, err error) {
w.Header().Set("Content-Type", "application/json")
w.Write(b)
}
func uint64FromPath(path string) (uint64, error) {
s := strings.TrimRight(path, "/")
return strconv.ParseUint(s, 10, 0)
}
func badRequest(err error) *httpserver.ErrorWithStatusCode {
return errResponse(err, http.StatusBadRequest)
}
func errResponse(err error, sc int) *httpserver.ErrorWithStatusCode {
return &httpserver.ErrorWithStatusCode{
Err: err,
StatusCode: sc,
}
}

View File

@@ -17,12 +17,14 @@ func TestHandler(t *testing.T) {
0: {},
},
}
rh := &requestHandler{
groups: []Group{{
Name: "group",
Rules: []*Rule{rule},
}},
g := &Group{
Name: "group",
Rules: []*Rule{rule},
}
m := &manager{groups: make(map[uint64]*Group)}
m.groups[0] = g
rh := &requestHandler{m: m}
getResp := func(url string, to interface{}, code int) {
t.Helper()
resp, err := http.Get(url)
@@ -52,19 +54,19 @@ func TestHandler(t *testing.T) {
t.Errorf("expected 1 alert got %d", length)
}
})
t.Run("/api/v1/group/0/status", func(t *testing.T) {
t.Run("/api/v1/0/0/status", func(t *testing.T) {
alert := &APIAlert{}
getResp(ts.URL+"/api/v1/group/0/status", alert, 200)
getResp(ts.URL+"/api/v1/0/0/status", alert, 200)
expAlert := rule.newAlertAPI(*rule.alerts[0])
if !reflect.DeepEqual(alert, expAlert) {
t.Errorf("expected %v is equal to %v", alert, expAlert)
}
})
t.Run("/api/v1/group/1/status", func(t *testing.T) {
getResp(ts.URL+"/api/v1/group/1/status", nil, 404)
t.Run("/api/v1/0/1/status", func(t *testing.T) {
getResp(ts.URL+"/api/v1/0/1/status", nil, 404)
})
t.Run("/api/v1/unknown-group/0/status", func(t *testing.T) {
getResp(ts.URL+"/api/v1/unknown-group/0/status", nil, 404)
t.Run("/api/v1/1/0/status", func(t *testing.T) {
getResp(ts.URL+"/api/v1/1/0/status", nil, 404)
})
t.Run("/", func(t *testing.T) {
getResp(ts.URL, nil, 200)

View File

@@ -19,7 +19,7 @@ The port can be modified via `-httpListenAddr` command-line flag.
The auth config can be reloaded by passing `SIGHUP` signal to `vmauth`.
Docker images for `vmauth` are available at [https://hub.docker.com/r/victoriametrics/vmauth/tags].
Docker images for `vmauth` are available [here](https://hub.docker.com/r/victoriametrics/vmauth/tags).
Pass `-help` to `vmauth` in order to see all the supported command-line flags with their descriptions.

View File

@@ -5,6 +5,7 @@ import (
"net/http"
"net/http/httputil"
"net/url"
"os"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo"
@@ -19,6 +20,8 @@ var (
)
func main() {
// Write flags and help message to stdout, since it is easier to grep or pipe.
flag.CommandLine.SetOutput(os.Stdout)
envflag.Parse()
buildinfo.Init()
logger.Init()

View File

@@ -19,6 +19,9 @@ Backed up data can be restored with [vmrestore](https://github.com/VictoriaMetri
See [this article](https://medium.com/@valyala/speeding-up-backups-for-big-time-series-databases-533c1a927883) for more details.
See also [vmbackuper](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/466) tool built on top of `vmbackup`. This tool simplifies
creation of hourly, daily, weekly and monthly backups.
### Use cases

View File

@@ -31,6 +31,8 @@ var (
)
func main() {
// Write flags and help message to stdout, since it is easier to grep or pipe.
flag.CommandLine.SetOutput(os.Stdout)
flag.Usage = usage
envflag.Parse()
buildinfo.Init()

View File

@@ -21,7 +21,8 @@ vmrestore -src=gcs://<bucket>/<path/to/backup> -storageDataPath=<local/path/to/r
* `<local/path/to/restore>` is the path to folder where data will be restored. This folder must be passed
to VictoriaMetrics in `-storageDataPath` command-line flag after the restore process is complete.
The original `-storageDataPath` directory may contain old files. They will be susbstituted by the files from backup.
The original `-storageDataPath` directory may contain old files. They will be susbstituted by the files from backup,
i.e. the end result would be similar to [rsync --delete](https://askubuntu.com/questions/476041/how-do-i-make-rsync-delete-files-that-have-been-deleted-from-the-source-folder).
### Troubleshooting
@@ -52,7 +53,7 @@ Run `vmrestore -help` in order to see all the available options:
-envflag.prefix string
Prefix for environment variables if -envflag.enable is set
-fs.disableMmap
Whether to use pread() instead of mmap() for reading data files
Whether to use pread() instead of mmap() for reading data files. By default mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot data files bigger than 2^32 bytes in memory
-loggerFormat string
Format for logs. Possible values: default, json (default "default")
-loggerLevel string
@@ -68,8 +69,8 @@ Run `vmrestore -help` in order to see all the available options:
-src string
Source path with backup on the remote storage. Example: gcs://bucket/path/to/backup/dir, s3://bucket/path/to/backup/dir or fs:///path/to/local/backup/dir
-storageDataPath string
Destination path where backup must be restored. VictoriaMetrics must be stopped when restoring from backup. -storageDataPath dir can be non-empty. In this case only missing data is downloaded from backup (default "victoria-metrics-data")
-version
Destination path where backup must be restored. VictoriaMetrics must be stopped when restoring from backup. -storageDataPath dir can be non-empty. In this case the contents of -storageDataPath dir is synchronized with -src contents, i.e. it works like 'rsync --delete' (default "victoria-metrics-data")
-version
Show VictoriaMetrics version
```

View File

@@ -3,6 +3,7 @@ package main
import (
"flag"
"fmt"
"os"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/backup/actions"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/backup/common"
@@ -16,13 +17,16 @@ var (
src = flag.String("src", "", "Source path with backup on the remote storage. "+
"Example: gcs://bucket/path/to/backup/dir, s3://bucket/path/to/backup/dir or fs:///path/to/local/backup/dir")
storageDataPath = flag.String("storageDataPath", "victoria-metrics-data", "Destination path where backup must be restored. "+
"VictoriaMetrics must be stopped when restoring from backup. -storageDataPath dir can be non-empty. In this case only missing data is downloaded from backup")
"VictoriaMetrics must be stopped when restoring from backup. -storageDataPath dir can be non-empty. In this case the contents of -storageDataPath dir "+
"is synchronized with -src contents, i.e. it works like 'rsync --delete'")
concurrency = flag.Int("concurrency", 10, "The number of concurrent workers. Higher concurrency may reduce restore duration")
maxBytesPerSecond = flag.Int("maxBytesPerSecond", 0, "The maximum download speed. There is no limit if it is set to 0")
skipBackupCompleteCheck = flag.Bool("skipBackupCompleteCheck", false, "Whether to skip checking for 'backup complete' file in -src. This may be useful for restoring from old backups, which were created without 'backup complete' file")
)
func main() {
// Write flags and help message to stdout, since it is easier to grep or pipe.
flag.CommandLine.SetOutput(os.Stdout)
flag.Usage = usage
envflag.Parse()
buildinfo.Init()

View File

@@ -15,6 +15,7 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/netstorage"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/promql"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/storage"
"github.com/VictoriaMetrics/metrics"
@@ -395,14 +396,14 @@ func TSDBStatusHandler(startTime time.Time, w http.ResponseWriter, r *http.Reque
if err := r.ParseForm(); err != nil {
return fmt.Errorf("cannot parse form values: %s", err)
}
date := time.Now().Unix() / secsPerDay
date := fasttime.UnixDate()
dateStr := r.FormValue("date")
if len(dateStr) > 0 {
t, err := time.Parse("2006-01-02", dateStr)
if err != nil {
return fmt.Errorf("cannot parse `date` arg %q: %s", dateStr, err)
}
date = t.Unix() / secsPerDay
date = uint64(t.Unix()) / secsPerDay
}
topN := 10
topNStr := r.FormValue("topN")
@@ -419,7 +420,7 @@ func TSDBStatusHandler(startTime time.Time, w http.ResponseWriter, r *http.Reque
}
topN = n
}
status, err := netstorage.GetTSDBStatusForDate(deadline, uint64(date), topN)
status, err := netstorage.GetTSDBStatusForDate(deadline, date, topN)
if err != nil {
return fmt.Errorf(`cannot obtain tsdb status for date=%d, topN=%d: %s`, date, topN, err)
}
@@ -992,7 +993,7 @@ func getBool(r *http.Request, argKey string) bool {
}
func currentTime() int64 {
return int64(time.Now().UTC().Unix()) * 1e3
return int64(fasttime.UnixTimestamp() * 1000)
}
func getTagFilterssFromMatches(matches []string) ([][]storage.TagFilter, error) {

View File

@@ -43,6 +43,8 @@ var aggrFuncs = map[string]aggrFunc{
"bottomk_max": newAggrFuncRangeTopK(maxValue, true),
"bottomk_avg": newAggrFuncRangeTopK(avgValue, true),
"bottomk_median": newAggrFuncRangeTopK(medianValue, true),
"any": newAggrFunc(aggrFuncAny),
"outliersk": aggrFuncOutliersK,
}
type aggrFunc func(afa *aggrFuncArg) ([]*timeseries, error)
@@ -64,7 +66,7 @@ func newAggrFunc(afe func(tss []*timeseries) []*timeseries) aggrFunc {
if err := expectTransformArgsNum(args, 1); err != nil {
return nil, err
}
return aggrFuncExt(afe, args[0], &afa.ae.Modifier, false)
return aggrFuncExt(afe, args[0], &afa.ae.Modifier, afa.ae.Limit, false)
}
}
@@ -80,7 +82,7 @@ func removeGroupTags(metricName *storage.MetricName, modifier *metricsql.Modifie
}
}
func aggrFuncExt(afe func(tss []*timeseries) []*timeseries, argOrig []*timeseries, modifier *metricsql.ModifierExpr, keepOriginal bool) ([]*timeseries, error) {
func aggrFuncExt(afe func(tss []*timeseries) []*timeseries, argOrig []*timeseries, modifier *metricsql.ModifierExpr, maxSeries int, keepOriginal bool) ([]*timeseries, error) {
arg := copyTimeseriesMetricNames(argOrig)
// Perform grouping.
@@ -92,7 +94,13 @@ func aggrFuncExt(afe func(tss []*timeseries) []*timeseries, argOrig []*timeserie
if keepOriginal {
ts = argOrig[i]
}
m[string(bb.B)] = append(m[string(bb.B)], ts)
tss := m[string(bb.B)]
if tss == nil && maxSeries > 0 && len(m) >= maxSeries {
// We already reached time series limit after grouping. Skip other time series.
continue
}
tss = append(tss, ts)
m[string(bb.B)] = tss
}
bbPool.Put(bb)
@@ -112,6 +120,10 @@ func aggrFuncExt(afe func(tss []*timeseries) []*timeseries, argOrig []*timeserie
return rvs, nil
}
func aggrFuncAny(tss []*timeseries) []*timeseries {
return tss[:1]
}
func aggrFuncSum(tss []*timeseries) []*timeseries {
if len(tss) == 1 {
// Fast path - nothing to sum.
@@ -441,7 +453,7 @@ func aggrFuncCountValues(afa *aggrFuncArg) ([]*timeseries, error) {
}
return rvs
}
return aggrFuncExt(afe, args[1], &afa.ae.Modifier, false)
return aggrFuncExt(afe, args[1], &afa.ae.Modifier, afa.ae.Limit, false)
}
func newAggrFuncTopK(isReverse bool) aggrFunc {
@@ -468,15 +480,10 @@ func newAggrFuncTopK(isReverse bool) aggrFunc {
}
return removeNaNs(tss)
}
return aggrFuncExt(afe, args[1], &afa.ae.Modifier, true)
return aggrFuncExt(afe, args[1], &afa.ae.Modifier, afa.ae.Limit, true)
}
}
type tsWithValue struct {
ts *timeseries
value float64
}
func newAggrFuncRangeTopK(f func(values []float64) float64, isReverse bool) aggrFunc {
return func(afa *aggrFuncArg) ([]*timeseries, error) {
args := afa.args
@@ -488,34 +495,42 @@ func newAggrFuncRangeTopK(f func(values []float64) float64, isReverse bool) aggr
return nil, err
}
afe := func(tss []*timeseries) []*timeseries {
maxs := make([]tsWithValue, len(tss))
for i, ts := range tss {
value := f(ts.Values)
maxs[i] = tsWithValue{
ts: ts,
value: value,
}
}
sort.Slice(maxs, func(i, j int) bool {
a := maxs[i].value
b := maxs[j].value
if isReverse {
a, b = b, a
}
return lessWithNaNs(a, b)
})
for i := range maxs {
tss[i] = maxs[i].ts
}
for i, k := range ks {
fillNaNsAtIdx(i, k, tss)
}
return removeNaNs(tss)
return getRangeTopKTimeseries(tss, ks, f, isReverse)
}
return aggrFuncExt(afe, args[1], &afa.ae.Modifier, true)
return aggrFuncExt(afe, args[1], &afa.ae.Modifier, afa.ae.Limit, true)
}
}
func getRangeTopKTimeseries(tss []*timeseries, ks []float64, f func(values []float64) float64, isReverse bool) []*timeseries {
type tsWithValue struct {
ts *timeseries
value float64
}
maxs := make([]tsWithValue, len(tss))
for i, ts := range tss {
value := f(ts.Values)
maxs[i] = tsWithValue{
ts: ts,
value: value,
}
}
sort.Slice(maxs, func(i, j int) bool {
a := maxs[i].value
b := maxs[j].value
if isReverse {
a, b = b, a
}
return lessWithNaNs(a, b)
})
for i := range maxs {
tss[i] = maxs[i].ts
}
for i, k := range ks {
fillNaNsAtIdx(i, k, tss)
}
return removeNaNs(tss)
}
func fillNaNsAtIdx(idx int, k float64, tss []*timeseries) {
if math.IsNaN(k) {
k = 0
@@ -577,16 +592,54 @@ func avgValue(values []float64) float64 {
func medianValue(values []float64) float64 {
h := histogram.GetFast()
for _, v := range values {
if math.IsNaN(v) {
continue
if !math.IsNaN(v) {
h.Update(v)
}
h.Update(v)
}
value := h.Quantile(0.5)
histogram.PutFast(h)
return value
}
func aggrFuncOutliersK(afa *aggrFuncArg) ([]*timeseries, error) {
args := afa.args
if err := expectTransformArgsNum(args, 2); err != nil {
return nil, err
}
ks, err := getScalar(args[0], 0)
if err != nil {
return nil, err
}
afe := func(tss []*timeseries) []*timeseries {
// Calculate medians for each point across tss.
medians := make([]float64, len(ks))
h := histogram.GetFast()
for n := range ks {
h.Reset()
for j := range tss {
v := tss[j].Values[n]
if !math.IsNaN(v) {
h.Update(v)
}
}
medians[n] = h.Quantile(0.5)
}
histogram.PutFast(h)
// Return topK time series with the highest variance from median.
f := func(values []float64) float64 {
sum2 := float64(0)
for n, v := range values {
d := v - medians[n]
sum2 += d * d
}
return sum2
}
return getRangeTopKTimeseries(tss, ks, f, false)
}
return aggrFuncExt(afe, args[1], &afa.ae.Modifier, afa.ae.Limit, true)
}
func aggrFuncLimitK(afa *aggrFuncArg) ([]*timeseries, error) {
args := afa.args
if err := expectTransformArgsNum(args, 2); err != nil {
@@ -618,7 +671,7 @@ func aggrFuncLimitK(afa *aggrFuncArg) ([]*timeseries, error) {
}
return tss
}
return aggrFuncExt(afe, args[1], &afa.ae.Modifier, true)
return aggrFuncExt(afe, args[1], &afa.ae.Modifier, afa.ae.Limit, true)
}
func aggrFuncQuantile(afa *aggrFuncArg) ([]*timeseries, error) {
@@ -631,7 +684,7 @@ func aggrFuncQuantile(afa *aggrFuncArg) ([]*timeseries, error) {
return nil, err
}
afe := newAggrQuantileFunc(phis)
return aggrFuncExt(afe, args[1], &afa.ae.Modifier, false)
return aggrFuncExt(afe, args[1], &afa.ae.Modifier, afa.ae.Limit, false)
}
func aggrFuncMedian(afa *aggrFuncArg) ([]*timeseries, error) {
@@ -641,30 +694,24 @@ func aggrFuncMedian(afa *aggrFuncArg) ([]*timeseries, error) {
}
phis := evalNumber(afa.ec, 0.5)[0].Values
afe := newAggrQuantileFunc(phis)
return aggrFuncExt(afe, args[0], &afa.ae.Modifier, false)
return aggrFuncExt(afe, args[0], &afa.ae.Modifier, afa.ae.Limit, false)
}
func newAggrQuantileFunc(phis []float64) func(tss []*timeseries) []*timeseries {
return func(tss []*timeseries) []*timeseries {
dst := tss[0]
h := histogram.GetFast()
defer histogram.PutFast(h)
for n := range dst.Values {
sort.Slice(tss, func(i, j int) bool {
a := tss[i].Values[n]
b := tss[j].Values[n]
return lessWithNaNs(a, b)
})
h.Reset()
for j := range tss {
v := tss[j].Values[n]
if !math.IsNaN(v) {
h.Update(v)
}
}
phi := phis[n]
if math.IsNaN(phi) {
phi = 1
}
if phi < 0 {
phi = 0
}
if phi > 1 {
phi = 1
}
idx := int(math.Round(float64(len(tss)-1) * phi))
dst.Values[n] = tss[idx].Values[n]
dst.Values[n] = h.Quantile(phi)
}
tss[0] = dst
return tss[:1]

View File

@@ -48,6 +48,11 @@ var incrementalAggrFuncCallbacksMap = map[string]*incrementalAggrFuncCallbacks{
mergeAggrFunc: mergeAggrGeomean,
finalizeAggrFunc: finalizeAggrGeomean,
},
"any": {
updateAggrFunc: updateAggrAny,
mergeAggrFunc: mergeAggrAny,
finalizeAggrFunc: finalizeAggrCommon,
},
}
type incrementalAggrFuncContext struct {
@@ -81,6 +86,10 @@ func (iafc *incrementalAggrFuncContext) updateTimeseries(ts *timeseries, workerI
bb.B = marshalMetricNameSorted(bb.B[:0], &ts.MetricName)
iac := m[string(bb.B)]
if iac == nil {
if iafc.ae.Limit > 0 && len(m) >= iafc.ae.Limit {
// Skip this time series, since the limit on the number of output time series has been already reached.
return
}
tsAggr := &timeseries{
Values: make([]float64, len(ts.Values)),
Timestamps: ts.Timestamps,
@@ -106,6 +115,10 @@ func (iafc *incrementalAggrFuncContext) finalizeTimeseries() []*timeseries {
for k, iac := range m {
iacGlobal := mGlobal[k]
if iacGlobal == nil {
if iafc.ae.Limit > 0 && len(mGlobal) >= iafc.ae.Limit {
// Skip this time series, since the limit on the number of output time series has been already reached.
continue
}
mGlobal[k] = iac
continue
}
@@ -450,3 +463,25 @@ func finalizeAggrGeomean(iac *incrementalAggrContext) {
dstValues[i] = math.Pow(dstValues[i], 1/v)
}
}
func updateAggrAny(iac *incrementalAggrContext, values []float64) {
dstCounts := iac.values
if dstCounts[0] > 0 {
return
}
for i := range values {
dstCounts[i] = 1
}
iac.ts.Values = append(iac.ts.Values[:0], values...)
}
func mergeAggrAny(dst, src *incrementalAggrContext) {
srcValues := src.ts.Values
srcCounts := src.values
dstCounts := dst.values
if dstCounts[0] > 0 {
return
}
dstCounts[0] = srcCounts[0]
dst.ts.Values = append(dst.ts.Values[:0], srcValues...)
}

View File

@@ -663,12 +663,17 @@ func evalRollupFuncWithMetricExpr(ec *EvalConfig, name string, rf rollupFunc,
pointsPerTimeseries := 1 + (ec.End-ec.Start)/ec.Step
timeseriesLen := rssLen
if iafc != nil {
// Incremental aggregates require hold only GOMAXPROCS timeseries in memory.
// Incremental aggregates require holding only GOMAXPROCS timeseries in memory.
timeseriesLen = runtime.GOMAXPROCS(-1)
if iafc.ae.Modifier.Op != "" {
// Increase the number of timeseries for non-empty group list: `aggr() by (something)`,
// since each group can have own set of time series in memory.
timeseriesLen *= 1000
if iafc.ae.Limit > 0 {
// There is an explicit limit on the number of output time series.
timeseriesLen *= iafc.ae.Limit
} else {
// Increase the number of timeseries for non-empty group list: `aggr() by (something)`,
// since each group can have own set of time series in memory.
timeseriesLen *= 1000
}
}
// The maximum number of output time series is limited by rssLen.
if timeseriesLen > rssLen {

View File

@@ -3496,6 +3496,21 @@ func TestExecSuccess(t *testing.T) {
resultExpected := []netstorage.Result{r1, r2}
f(q, resultExpected)
})
t.Run(`sum(multi-vector) by (known-tag) limit 1`, func(t *testing.T) {
t.Parallel()
q := `sum(label_set(10, "foo", "bar") or label_set(time()/100, "baz", "sss")) by (foo) limit 1`
r := netstorage.Result{
MetricName: metricNameExpected,
Values: []float64{10, 10, 10, 10, 10, 10},
Timestamps: timestampsExpected,
}
r.MetricName.Tags = []storage.Tag{{
Key: []byte("foo"),
Value: []byte("bar"),
}}
resultExpected := []netstorage.Result{r}
f(q, resultExpected)
})
t.Run(`sum(multi-vector) by (known-tags)`, func(t *testing.T) {
t.Parallel()
q := `sum(label_set(10, "foo", "bar", "baz", "sss", "x", "y") or label_set(time()/100, "baz", "sss", "foo", "bar")) by (foo, baz, foo)`
@@ -3562,7 +3577,7 @@ func TestExecSuccess(t *testing.T) {
q := `sort(histogram_over_time(alias(label_set(rand(0)*1.3+1.1, "foo", "bar"), "xxx")[200s:5s]))`
r1 := netstorage.Result{
MetricName: metricNameExpected,
Values: []float64{14, 15, 12, 13, 15, 11},
Values: []float64{14, 16, 12, 13, 15, 11},
Timestamps: timestampsExpected,
}
r1.MetricName.Tags = []storage.Tag{
@@ -3592,7 +3607,7 @@ func TestExecSuccess(t *testing.T) {
}
r3 := netstorage.Result{
MetricName: metricNameExpected,
Values: []float64{13, 11, 16, 19, 13, 16},
Values: []float64{13, 10, 16, 19, 13, 16},
Timestamps: timestampsExpected,
}
r3.MetricName.Tags = []storage.Tag{
@@ -3613,7 +3628,7 @@ func TestExecSuccess(t *testing.T) {
q := `sort(sum(histogram_over_time(alias(label_set(rand(0)*1.3+1.1, "foo", "bar"), "xxx")[200s:5s])) by (vmrange))`
r1 := netstorage.Result{
MetricName: metricNameExpected,
Values: []float64{14, 15, 12, 13, 15, 11},
Values: []float64{14, 16, 12, 13, 15, 11},
Timestamps: timestampsExpected,
}
r1.MetricName.Tags = []storage.Tag{
@@ -3635,7 +3650,7 @@ func TestExecSuccess(t *testing.T) {
}
r3 := netstorage.Result{
MetricName: metricNameExpected,
Values: []float64{13, 11, 16, 19, 13, 16},
Values: []float64{13, 10, 16, 19, 13, 16},
Timestamps: timestampsExpected,
}
r3.MetricName.Tags = []storage.Tag{
@@ -3663,7 +3678,7 @@ func TestExecSuccess(t *testing.T) {
q := `topk_max(1, histogram_over_time(alias(label_set(rand(0)*1.3+1.1, "foo", "bar"), "xxx")[200s:5s]))`
r := netstorage.Result{
MetricName: metricNameExpected,
Values: []float64{13, 11, 16, 19, 13, 16},
Values: []float64{13, 10, 16, 19, 13, 16},
Timestamps: timestampsExpected,
}
r.MetricName.Tags = []storage.Tag{
@@ -3768,6 +3783,17 @@ func TestExecSuccess(t *testing.T) {
resultExpected := []netstorage.Result{r1, r2}
f(q, resultExpected)
})
t.Run(`any()`, func(t *testing.T) {
t.Parallel()
q := `any(label_set(10, "foo", "bar") or label_set(time()/150, "baz", "sss"))`
r := netstorage.Result{
MetricName: metricNameExpected,
Values: []float64{10, 10, 10, 10, 10, 10},
Timestamps: timestampsExpected,
}
resultExpected := []netstorage.Result{r}
f(q, resultExpected)
})
t.Run(`topk(-1)`, func(t *testing.T) {
t.Parallel()
q := `sort(topk(-1, label_set(10, "foo", "bar") or label_set(time()/150, "baz", "sss")))`
@@ -4170,14 +4196,63 @@ func TestExecSuccess(t *testing.T) {
t.Run(`quantile(NaN)`, func(t *testing.T) {
t.Parallel()
q := `quantile(NaN, label_set(10, "foo", "bar") or label_set(time()/150, "baz", "sss"))`
resultExpected := []netstorage.Result{}
f(q, resultExpected)
})
t.Run(`outliersk(0)`, func(t *testing.T) {
t.Parallel()
q := `outliersk(0, (
label_set(1300, "foo", "bar"),
label_set(time(), "baz", "sss"),
))`
resultExpected := []netstorage.Result{}
f(q, resultExpected)
})
t.Run(`outliersk(1)`, func(t *testing.T) {
t.Parallel()
q := `outliersk(1, (
label_set(2000, "foo", "bar"),
label_set(time(), "baz", "sss"),
))`
r := netstorage.Result{
MetricName: metricNameExpected,
Values: []float64{10, 10, 10, 10.666666666666666, 12, 13.333333333333334},
Values: []float64{1000, 1200, 1400, 1600, 1800, 2000},
Timestamps: timestampsExpected,
}
r.MetricName.Tags = []storage.Tag{{
Key: []byte("baz"),
Value: []byte("sss"),
}}
resultExpected := []netstorage.Result{r}
f(q, resultExpected)
})
t.Run(`outliersk(3)`, func(t *testing.T) {
t.Parallel()
q := `sort_desc(outliersk(3, (
label_set(1300, "foo", "bar"),
label_set(time(), "baz", "sss"),
)))`
r1 := netstorage.Result{
MetricName: metricNameExpected,
Values: []float64{1000, 1200, 1400, 1600, 1800, 2000},
Timestamps: timestampsExpected,
}
r1.MetricName.Tags = []storage.Tag{{
Key: []byte("baz"),
Value: []byte("sss"),
}}
r2 := netstorage.Result{
MetricName: metricNameExpected,
Values: []float64{1300, 1300, 1300, 1300, 1300, 1300},
Timestamps: timestampsExpected,
}
r2.MetricName.Tags = []storage.Tag{{
Key: []byte("foo"),
Value: []byte("bar"),
}}
resultExpected := []netstorage.Result{r1, r2}
f(q, resultExpected)
})
t.Run(`range_quantile(0.5)`, func(t *testing.T) {
t.Parallel()
q := `range_quantile(0.5, time())`
@@ -5505,6 +5580,8 @@ func TestExecError(t *testing.T) {
f(`hoeffding_bound_upper()`)
f(`hoeffding_bound_upper(1)`)
f(`hoeffding_bound_upper(0.99, foo, 1)`)
f(`outliersk()`)
f(`outliersk(1)`)
// Invalid argument type
f(`median_over_time({}, 2)`)
@@ -5544,6 +5621,7 @@ func TestExecError(t *testing.T) {
f(`alias(1, 2)`)
f(`aggr_over_time(1, 2)`)
f(`aggr_over_time(("foo", "bar"), 3)`)
f(`outliersk((label_set(1, "foo", "bar"), label_set(2, "x", "y")), 123)`)
// Duplicate timeseries
f(`(label_set(1, "foo", "bar") or label_set(2, "foo", "baz"))

View File

@@ -72,6 +72,8 @@ var rollupFuncs = map[string]newRollupFunc{
"aggr_over_time": newRollupFuncTwoArgs(rollupFake),
"hoeffding_bound_upper": newRollupHoeffdingBoundUpper,
"hoeffding_bound_lower": newRollupHoeffdingBoundLower,
"ascent_over_time": newRollupFuncOneArg(rollupAscentOverTime),
"descent_over_time": newRollupFuncOneArg(rollupDescentOverTime),
// `timestamp` function must return timestamp for the last datapoint on the current window
// in order to properly handle offset and timestamps unaligned to the current step.
@@ -116,6 +118,9 @@ var rollupAggrFuncs = map[string]rollupFunc{
"scrape_interval": rollupScrapeInterval,
"tmin_over_time": rollupTmin,
"tmax_over_time": rollupTmax,
"ascent_over_time": rollupAscentOverTime,
"descent_over_time": rollupDescentOverTime,
"timestamp": rollupTimestamp,
}
var rollupFuncsCannotAdjustWindow = map[string]bool{
@@ -138,6 +143,8 @@ var rollupFuncsCannotAdjustWindow = map[string]bool{
"increases_over_time": true,
"decreases_over_time": true,
"integrate": true,
"ascent_over_time": true,
"descent_over_time": true,
}
var rollupFuncsRemoveCounterResets = map[string]bool{
@@ -1527,6 +1534,52 @@ func rollupTimestamp(rfa *rollupFuncArg) float64 {
return float64(timestamps[len(timestamps)-1]) / 1e3
}
func rollupAscentOverTime(rfa *rollupFuncArg) float64 {
// There is no need in handling NaNs here, since they must be cleaned up
// before calling rollup funcs.
values := rfa.values
prevValue := rfa.prevValue
if math.IsNaN(prevValue) {
if len(values) == 0 {
return nan
}
prevValue = values[0]
values = values[1:]
}
s := float64(0)
for _, v := range values {
d := v - prevValue
if d > 0 {
s += d
}
prevValue = v
}
return s
}
func rollupDescentOverTime(rfa *rollupFuncArg) float64 {
// There is no need in handling NaNs here, since they must be cleaned up
// before calling rollup funcs.
values := rfa.values
prevValue := rfa.prevValue
if math.IsNaN(prevValue) {
if len(values) == 0 {
return nan
}
prevValue = values[0]
values = values[1:]
}
s := float64(0)
for _, v := range values {
d := prevValue - v
if d > 0 {
s += d
}
prevValue = v
}
return s
}
func rollupFirst(rfa *rollupFuncArg) float64 {
// There is no need in handling NaNs here, since they must be cleaned up
// before calling rollup funcs.

View File

@@ -10,6 +10,7 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/memory"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/workingsetcache"
@@ -64,18 +65,18 @@ func InitRollupResultCache(cachePath string) {
stats := &fastcache.Stats{}
var statsLock sync.Mutex
var statsLastUpdate time.Time
var statsLastUpdate uint64
fcs := func() *fastcache.Stats {
statsLock.Lock()
defer statsLock.Unlock()
if time.Since(statsLastUpdate) < time.Second {
if fasttime.UnixTimestamp()-statsLastUpdate < 2 {
return stats
}
var fcs fastcache.Stats
c.UpdateStats(&fcs)
stats = &fcs
statsLastUpdate = time.Now()
statsLastUpdate = fasttime.UnixTimestamp()
return stats
}
if len(rollupResultCachePath) > 0 {

View File

@@ -389,6 +389,9 @@ func TestRollupNewRollupFuncSuccess(t *testing.T) {
f("ideriv", 0)
f("decreases_over_time", 5)
f("increases_over_time", 5)
f("ascent_over_time", 142)
f("descent_over_time", 231)
f("timestamp", 0.13)
}
func TestRollupNewRollupFuncError(t *testing.T) {

View File

@@ -409,6 +409,16 @@ func registerStorageMetrics() {
return float64(m().AddRowsConcurrencyCurrent)
})
metrics.NewGauge(`vm_slow_row_inserts_total`, func() float64 {
return float64(m().SlowRowInserts)
})
metrics.NewGauge(`vm_slow_per_day_index_inserts_total`, func() float64 {
return float64(m().SlowPerDayIndexInserts)
})
metrics.NewGauge(`vm_slow_metric_name_loads_total`, func() float64 {
return float64(m().SlowMetricNameLoads)
})
metrics.NewGauge(`vm_rows{type="storage/big"}`, func() float64 {
return float64(tm().BigRowsCount)
})
@@ -452,6 +462,9 @@ func registerStorageMetrics() {
metrics.NewGauge(`vm_cache_entries{type="storage/hour_metric_ids"}`, func() float64 {
return float64(m().HourMetricIDCacheSize)
})
metrics.NewGauge(`vm_cache_entries{type="storage/next_day_metric_ids"}`, func() float64 {
return float64(m().NextDayMetricIDCacheSize)
})
metrics.NewGauge(`vm_cache_entries{type="storage/bigIndexBlocks"}`, func() float64 {
return float64(tm().BigIndexBlocksCacheSize)
})
@@ -492,6 +505,9 @@ func registerStorageMetrics() {
metrics.NewGauge(`vm_cache_size_bytes{type="storage/hour_metric_ids"}`, func() float64 {
return float64(m().HourMetricIDCacheSizeBytes)
})
metrics.NewGauge(`vm_cache_size_bytes{type="storage/next_day_metric_ids"}`, func() float64 {
return float64(m().NextDayMetricIDCacheSizeBytes)
})
metrics.NewGauge(`vm_cache_size_bytes{type="indexdb/tagFilters"}`, func() float64 {
return float64(idbm().TagCacheSizeBytes)
})

File diff suppressed because it is too large Load Diff

View File

@@ -4,7 +4,7 @@ DOCKER_NAMESPACE := victoriametrics
ROOT_IMAGE ?= scratch
CERTS_IMAGE := alpine:3.11
GO_BUILDER_IMAGE := golang:1.14.2
GO_BUILDER_IMAGE := golang:1.14.3
BUILDER_IMAGE := local/builder:2.0.0-$(shell echo $(GO_BUILDER_IMAGE) | tr : _)
BASE_IMAGE := local/base:1.1.1-$(shell echo $(ROOT_IMAGE) | tr : _)-$(shell echo $(CERTS_IMAGE) | tr : _)
@@ -32,7 +32,9 @@ app-via-docker: package-base package-builder
--env GO111MODULE=on \
$(DOCKER_OPTS) \
$(BUILDER_IMAGE) \
go build $(RACE) -mod=vendor -trimpath -ldflags "-s -w -extldflags '-static' $(GO_BUILDINFO)" -tags 'netgo osusergo' \
go build $(RACE) -mod=vendor -trimpath \
-ldflags "-extldflags '-static' $(GO_BUILDINFO)" \
-tags 'netgo osusergo nethttpomithttp2' \
-o bin/$(APP_NAME)$(APP_SUFFIX)-prod $(PKG_PREFIX)/app/$(APP_NAME)
package-via-docker:

View File

@@ -30,6 +30,8 @@
* [Better Prometheus rate() function with VictoriaMetrics](https://www.percona.com/blog/2020/02/28/better-prometheus-rate-function-with-victoriametrics/)
* [Infrastructure monitoring with Prometheus at Zerodha](https://zerodha.tech/blog/infra-monitoring-at-zerodha/)
* [CMS monitoring R&D: Real-time monitoring and alerts](https://indico.cern.ch/event/877333/contributions/3696707/attachments/1972189/3281133/CMS_mon_RD_for_opInt.pdf)
* [Disk usage: VictoriaMetrics vs Prometheus](https://stas.starikevich.com/posts/disk-usage-for-vm-versus-prometheus/)
* [Benchmarking time series workloads on Apache Kudu using TSBS](https://blog.cloudera.com/benchmarking-time-series-workloads-on-apache-kudu-using-tsbs/)
* [What are Open Source Time Series Databases?](https://www.iunera.com/kraken/fabric/time-series-database/)
* [Evaluating performance and correctness](https://www.robustperception.io/evaluating-performance-and-correctness)

View File

@@ -3,6 +3,7 @@
Below are approved public case studies and talks from VictoriaMetrics users. Join our [community Slack channel](http://slack.victoriametrics.com/)
and feel free asking for references, reviews and additional case studies from real VictoriaMetrics users there.
## Adidas
See [slides](https://promcon.io/2019-munich/slides/remote-write-storage-wars.pdf) and [video](https://youtu.be/OsH6gPdxR4s)
@@ -10,6 +11,20 @@ from [Remote Write Storage Wars](https://promcon.io/2019-munich/talks/remote-wri
VictoriaMetrics is compared to Thanos, Corex and M3DB in the talk.
## CERN
The European Organization for Nuclear Research known as [CERN](https://home.cern/) uses VictoriaMetrics for real-time monitoring
of the [CMS](https://home.cern/science/experiments/cms) detector system.
According to [published talk](https://indico.cern.ch/event/877333/contributions/3696707/attachments/1972189/3281133/CMS_mon_RD_for_opInt.pdf)
VictoriaMetrics is used for the following purposes as a part of "CMS Monitoring cluster":
* As long-term storage for messages consumed from the [NATS messaging system](https://nats.io/). Consumed messages are pushed directly to VictoriaMetrics via HTTP protocol
* As long-term storage for Prometheus monitoring system (30 days retention policy, there are plans to increase it up to ½ year)
* As a data source for visualizing metrics in Grafana.
R&D topic: Evaluate VictoraMetrics vs InfluxDB for large cardinality data.
## COLOPL
[COLOPL](http://www.colopl.co.jp/en/) is Japaneese Game Development company. It started using VictoriaMetrics

View File

@@ -167,21 +167,23 @@ or [an alternative dashboard for VictoriaMetrics cluster](https://grafana.com/gr
- `<accountID>` is an arbitrary 32-bit integer identifying namespace for data ingestion (aka tenant). It is possible to set it as `accountID:projectID`,
where `projectID` is also arbitrary 32-bit integer. If `projectID` isn't set, then it equals to `0`.
- `<suffix>` may have the following values:
- `prometheus` - for inserting data with [Prometheus remote write API](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write)
- `influx/write` or `influx/api/v2/write` - for inserting data with [Influx line protocol](https://docs.influxdata.com/influxdb/v1.7/write_protocols/line_protocol_tutorial/)
- `prometheus` and `prometheus/api/v1/write` - for inserting data with [Prometheus remote write API](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write)
- `influx/write` and `influx/api/v2/write` - for inserting data with [Influx line protocol](https://docs.influxdata.com/influxdb/v1.7/write_protocols/line_protocol_tutorial/).
- `opentsdb/api/put` - for accepting [OpenTSDB HTTP /api/put requests](http://opentsdb.net/docs/build/html/api_http/put.html).
- `prometheus/api/v1/import` - for importing data obtained via `api/v1/export` on `vmselect` (see below).
- `prometheus/api/v1/import/csv` - for importing arbitrary CSV data. See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#how-to-import-csv-data) for details.
* URLs for querying: `http://<vmselect>:8481/select/<accountID>/prometheus/<suffix>`, where:
- `<accountID>` is an arbitrary number identifying data namespace for the query (aka tenant)
- `<suffix>` may have the following values:
- `api/v1/query` - performs [PromQL instant query](https://prometheus.io/docs/prometheus/latest/querying/api/#instant-queries)
- `api/v1/query_range` - performs [PromQL range query](https://prometheus.io/docs/prometheus/latest/querying/api/#range-queries)
- `api/v1/series` - performs [series query](https://prometheus.io/docs/prometheus/latest/querying/api/#finding-series-by-label-matchers)
- `api/v1/labels` - returns a [list of label names](https://prometheus.io/docs/prometheus/latest/querying/api/#getting-label-names)
- `api/v1/label/<label_name>/values` - returns values for the given `<label_name>` according [to API](https://prometheus.io/docs/prometheus/latest/querying/api/#querying-label-values)
- `federate` - returns [federated metrics](https://prometheus.io/docs/prometheus/latest/federation/)
- `api/v1/export` - exports raw data. See [this article](https://medium.com/@valyala/analyzing-prometheus-data-with-external-tools-5f3e5e147639) for details
- `api/v1/query` - performs [PromQL instant query](https://prometheus.io/docs/prometheus/latest/querying/api/#instant-queries).
- `api/v1/query_range` - performs [PromQL range query](https://prometheus.io/docs/prometheus/latest/querying/api/#range-queries).
- `api/v1/series` - performs [series query](https://prometheus.io/docs/prometheus/latest/querying/api/#finding-series-by-label-matchers).
- `api/v1/labels` - returns a [list of label names](https://prometheus.io/docs/prometheus/latest/querying/api/#getting-label-names).
- `api/v1/label/<label_name>/values` - returns values for the given `<label_name>` according [to API](https://prometheus.io/docs/prometheus/latest/querying/api/#querying-label-values).
- `federate` - returns [federated metrics](https://prometheus.io/docs/prometheus/latest/federation/).
- `api/v1/export` - exports raw data. See [this article](https://medium.com/@valyala/analyzing-prometheus-data-with-external-tools-5f3e5e147639) for details.
- `api/v1/status/tsdb` - for time series stats. See [these docs](https://prometheus.io/docs/prometheus/latest/querying/api/#tsdb-stats) for details.
* URL for time series deletion: `http://<vmselect>:8481/delete/<accountID>/prometheus/api/v1/admin/tsdb/delete_series?match[]=<timeseries_selector_for_delete>`.
Note that the `delete_series` handler should be used only in exceptional cases such as deletion of accidentally ingested incorrect time series. It shouldn't

View File

@@ -4,6 +4,8 @@ VictoriaMetrics implements MetricsQL - query language inspired by [PromQL](https
It is backwards compatible with PromQL, so Grafana dashboards backed by Prometheus datasource should work the same after switching from Prometheus to VictoriaMetrics.
[Standalone MetricsQL package](https://godoc.org/github.com/VictoriaMetrics/metricsql) can be used for parsing MetricsQL in external apps.
If you are unfamiliar with PromQL, then it is suggested reading [this tutorial for beginners](https://medium.com/@valyala/promql-tutorial-for-beginners-9ab455142085).
The following functionality is implemented differently in MetricsQL comparing to PromQL in order to improve user experience:
* MetricsQL takes into account the previous point before the window in square brackets for range functions such as `rate` and `increase`.
It also doesn't extrapolate range function results. This addresses [this issue from Prometheus](https://github.com/prometheus/prometheus/issues/3746).
@@ -22,6 +24,8 @@ Feel free [filing a feature request](https://github.com/VictoriaMetrics/Victoria
This functionality can be tried at [an editable Grafana dashboard](http://play-grafana.victoriametrics.com:3000/d/4ome8yJmz/node-exporter-on-victoriametrics-demo).
- [`WITH` templates](https://play.victoriametrics.com/promql/expand-with-exprs). This feature simplifies writing and managing complex queries. Go to [`WITH` templates playground](https://victoriametrics.com/promql/expand-with-exprs) and try it.
- All the aggregate functions support optional `limit N` suffix in order to limit the number of output series. For example, `sum(x) by (y) limit 10` limits
the number of output time series after the aggregation to 10. All the other time series are dropped.
- Metric names and metric labels may contain escaped chars. For instance, `foo\-bar{baz\=aa="b"}` is valid expression. It returns time series with name `foo-bar` containing label `baz=aa` with value `b`. Additionally, `\xXX` escape sequence is supported, where `XX` is hexadecimal representation of escaped char.
- `offset`, range duration and step value for range vector may refer to the current step aka `$__interval` value from Grafana.
For instance, `rate(metric[10i] offset 5i)` would return per-second rate over a range covering 10 previous steps with the offset of 5 steps.
@@ -72,6 +76,8 @@ This functionality can be tried at [an editable Grafana dashboard](http://play-g
- `median_over_time(m[d])` - calculates median values for `m` over `d` time window. Shorthand to `quantile_over_time(0.5, m[d])`.
- `median(q)` - median aggregate. Shorthand to `quantile(0.5, q)`.
- `limitk(k, q)` - limits the number of time series returned from `q` to `k`.
- `any(q) by (x)` - returns any time series from `q` for each group in `x`. Note that `any()` removes all the labels except of those listed in `by (x)`.
Use `limitk(1, q)` if you need retaining all the labels from `q`.
- `keep_last_value(q)` - fills missing data (gaps) in `q` with the previous non-empty value.
- `keep_next_value(q)` - fills missing data (gaps) in `q` with the next non-empty value.
- `distinct_over_time(m[d])` - returns distinct number of values for `m` data points over `d` duration.
@@ -112,3 +118,7 @@ This functionality can be tried at [an editable Grafana dashboard](http://play-g
for the given `phi` in the range `[0..1]`.
- `last_over_time(m[d])` - returns the last value for `m` on the time range `d`.
- `first_over_time(m[d])` - returns the first value for `m` on the time range `d`.
- `outliersk(N, m)` - returns up to `N` outlier time series for `m`. Outlier time series have the highest deviation from the `median(m)`.
This aggregate function is useful to detect anomalies across groups of similar time series.
- `ascent_over_time(m[d])` - returns the sum of positive deltas between adjancent data points in `m` over `d`. Useful for tracking height gains in GPS track.
- `descent_over_time(m[d])` - returns the absolute sum of negative deltas between adjancent data points in `m` over `d`. Useful for tracking height loss in GPS track.

View File

@@ -1,7 +1,7 @@
# Quick Start
1. Download the latest VictoriaMetrics release from [releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases),
from [Docker hub](https://hub.docker.com/r/valyala/victoria-metrics/)
from [Docker hub](https://hub.docker.com/r/victoriametrics/victoria-metrics/)
or [build it from sources](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/Single-server-VictoriaMetrics#how-to-build-from-sources).
2. Run the binary or Docker image with the desired command-line flags. Pass `-help` in order to see description for all the available flags
@@ -17,8 +17,10 @@
See [these instructions](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/43) in order to configure VictoriaMetrics as OS service.
It is recommended setting up [VictoriaMetrics monitoring](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#monitoring).
3. Configure all the Prometheus instances to write data to VictoriaMetrics.
See [these instructions](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/Single-server-VictoriaMetrics#prometheus-setup).
3. Configure [vmagent](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md) or Prometheus to write data to VictoriaMetrics.
It is recommended to use `vmagent` instead of Prometheus, since it is more resource efficient. If you still prefer Prometheus, then
see [these instructions](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/Single-server-VictoriaMetrics#prometheus-setup)
for details on how to configure Prometheus.
4. Configure Grafana to query VictoriaMetrics instead of Prometheus.
See [these instructions](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/Single-server-VictoriaMetrics#grafana-setup).

View File

@@ -10,16 +10,21 @@
## VictoriaMetrics
VictoriaMetrics is fast, cost-effective and scalable time-series database. It can be used as long-term remote storage for Prometheus.
VictoriaMetrics is fast, cost-effective and scalable time-series database.
It is available in [binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases),
[docker images](https://hub.docker.com/r/victoriametrics/victoria-metrics/) and
in [source code](https://github.com/VictoriaMetrics/VictoriaMetrics). Just download VictoriaMetrics and see [how to start it](#how-to-start-victoriametrics).
Cluster version is available [here](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/cluster).
[Contact us](mailto:info@victoriametrics.com) if you need paid enterprise support for VictoriaMetrics.
## Case studies and talks
* [Adidas](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/CaseStudies#adidas)
* [CERN](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/CaseStudies#cern)
* [COLOPL](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/CaseStudies#colopl)
* [Wix.com](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/CaseStudies#wixcom)
* [Wedos.com](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/CaseStudies#wedoscom)
@@ -33,6 +38,8 @@ Cluster version is available [here](https://github.com/VictoriaMetrics/VictoriaM
## Prominent features
* VictoriaMetrics can be used as long-term storage for Prometheus or for [vmagent](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md).
See [these docs](#prometheus-setup) for details.
* Supports [Prometheus querying API](https://prometheus.io/docs/prometheus/latest/querying/api/), so it can be used as Prometheus drop-in replacement in Grafana.
VictoriaMetrics implements [MetricsQL](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/MetricsQL) query language, which is inspired by PromQL.
* Supports global query view. Multiple Prometheus instances may write data into VictoriaMetrics. Later this data may be used in a single query.
@@ -115,6 +122,8 @@ Cluster version is available [here](https://github.com/VictoriaMetrics/VictoriaM
* [Monitoring](#monitoring)
* [Troubleshooting](#troubleshooting)
* [Backfilling](#backfilling)
* [Replication](#replication)
* [Backups](#backups)
* [Profiling](#profiling)
* [Integrations](#integrations)
* [Third-party contributions](#third-party-contributions)
@@ -139,6 +148,8 @@ The following command-line flags are used the most:
* `-retentionPeriod` - retention period in months for the data. Older data is automatically deleted. Default period is 1 month.
* `-httpListenAddr` - TCP address to listen to for http requests. By default, it listens port `8428` on all the network interfaces.
Other flags have good enough default values, so set them only if you really need this.
Pass `-help` to see all the available flags with description and default values.
It is recommended setting up [monitoring](#monitoring) for VictoriaMetrics.
@@ -779,6 +790,8 @@ remote_write:
kill -HUP `pidof prometheus`
```
It is recommended to use [vmagent](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md) instead of Prometheus for highly loaded setups.
4) Now Prometheus should write data into all the configured `remote_write` urls in parallel.
5) Set up [Promxy](https://github.com/jacksontj/promxy) in front of all the VictoriaMetrics replicas.
6) Set up Prometheus datasource in Grafana that points to Promxy.
@@ -789,6 +802,7 @@ to write data to `victoriametrics-addr-1`, while each `r2` should write data to
Another option is to write data simultaneously from Prometheus HA pair to a pair of VictoriaMetrics instances
with the enabled de-duplication. See [this section](#deduplication) for details.
### Deduplication
VictoriaMetrics de-duplicates data points if `-dedup.minScrapeInterval` command-line flag
@@ -890,7 +904,8 @@ mkfs.ext4 ... -O 64bit,huge_file,extent -T huge
### Monitoring
VictoriaMetrics exports internal metrics in Prometheus format at `/metrics` page.
These metrics may be collected via Prometheus by adding the corresponding scrape config to it.
These metrics may be collected by [vmagent](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmagent/README.md)
or Prometheus by adding the corresponding scrape config to it.
Alternatively they can be self-scraped by setting `-selfScrapeInterval` command-line flag to duration greater than 0.
For example, `-selfScrapeInterval=10s` would enable self-scraping of `/metrics` page with 10 seconds interval.
@@ -901,13 +916,17 @@ The most interesting metrics are:
* `vm_cache_entries{type="storage/hour_metric_ids"}` - the number of time series with new data points during the last hour
aka active time series.
* `rate(vm_new_timeseries_created_total[5m])` - time series churn rate.
* `vm_rows{type="indexdb"}` - the number of rows in inverted index. High value for this number usually mean high churn rate for time series.
* Sum of `vm_rows{type="storage/big"}` and `vm_rows{type="storage/small"}` - total number of `(timestamp, value)` data points
in the database.
* `vm_rows_inserted_total` - the total number of inserted rows since VictoriaMetrics start.
* `increase(vm_new_timeseries_created_total[1h])` - time series churn rate during the previous hour.
* `sum(vm_rows{type=~"storage/.*"})` - total number of `(timestamp, value)` data points in the database.
* `sum(rate(vm_rows_inserted_total[5m]))` - ingestion rate, i.e. how many samples are inserted int the database per second.
* `vm_free_disk_space_bytes` - free space left at `-storageDataPath`.
* `sum(vm_data_size_bytes)` - the total data size on disk.
* `sum(vm_data_size_bytes)` - the total size of data on disk.
* `increase(vm_slow_row_inserts_total[5m])` - the number of slow inserts during the last 5 minutes.
If this number remains high during extended periods of time, then it is likely more RAM is needed for optimal handling
of the current number of active time series.
* `increase(vm_slow_metric_name_loads_total[5m])` - the number of slow loads of metric names during the last 5 minutes.
If this number remains high during extended periods of time, then it is likely more RAM is needed for optimal handling
of the current number of active time series.
### Troubleshooting
@@ -920,8 +939,9 @@ The most interesting metrics are:
* If VictoriaMetrics works slowly and eats more than a CPU core per 100K ingested data points per second,
then it is likely you have too many active time series for the current amount of RAM.
VictoriaMetrics [exposes](#monitoring) `vm_slow_*` metrics, which could be used as an indicator of low amounts of RAM.
It is recommended increasing the amount of RAM on the node with VictoriaMetrics in order to improve
ingestion performance.
ingestion and query performance in this case.
Another option is to increase `-memory.allowedPercent` command-line flag value. Be careful with this
option, since too big value for `-memory.allowedPercent` may result in high I/O usage.
@@ -968,6 +988,24 @@ the query cache, which could contain incomplete data cached during the backfilli
Yet another solution is to increase `-search.cacheTimestampOffset` flag value in order to disable caching
for data with timestamps close to the current time.
### Replication
VictoriaMetrics relies on replicated durable persistent storage such as [Google Cloud disks](https://cloud.google.com/compute/docs/disks#pdspecs)
or [Amazon EBS](https://aws.amazon.com/ebs/). It is also recommended making periodic backups,
since [replication doesn't save from disaster](https://medium.com/@valyala/speeding-up-backups-for-big-time-series-databases-533c1a927883).
See [backup docs](#backups) for details.
See also [high availability docs](#high-availability) and [docs about cluster version of VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/README.md).
### Backups
VictoriaMetrics supports backups via [vmbackup](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmbackup/README.md)
and [vmrestore](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmrestore/README.md) tools.
We also provide provide `vmbackuper` tool for paid enterprise subscribers - see [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/466) for details.
### Profiling
VictoriaMetrics provides handlers for collecting the following [Go profiles](https://blog.golang.org/profiling-go-programs):

View File

@@ -1,8 +1,8 @@
## vmagent
`vmagent` is a tiny but brave agent, which helps you collecting metrics from various sources
and storing them to [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics)
or any other Prometheus-compatible storage system that supports `remote_write` protocol.
`vmagent` is a tiny but brave agent, which helps you collect metrics from various sources
and stores them in [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics)
or any other Prometheus-compatible storage system that supports the `remote_write` protocol.
<img alt="vmagent" src="vmagent.png">
@@ -11,7 +11,7 @@ or any other Prometheus-compatible storage system that supports `remote_write` p
While VictoriaMetrics provides an efficient solution to store and observe metrics, our users needed something fast
and RAM friendly to scrape metrics from Prometheus-compatible exporters to VictoriaMetrics.
Also, we found that users infrastructure is like snowflakes - never alike, and we decided to add more flexibility
Also, we found that users infrastructure are snowflakes - no two are alike, and we decided to add more flexibility
to `vmagent` (like the ability to push metrics instead of pulling them). We did our best and plan to do even more.
@@ -31,7 +31,7 @@ to `vmagent` (like the ability to push metrics instead of pulling them). We did
* Works in environments with unstable connections to remote storage. If the remote storage is unavailable, the collected metrics
are buffered at `-remoteWrite.tmpDataPath`. The buffered metrics are sent to remote storage as soon as connection
to remote storage is recovered. The maximum disk usage for the buffer can be limited with `-remoteWrite.maxDiskUsagePerURL`.
* Uses lower amounts of RAM, CPU, disk IO and network bandwidth comparing to Prometheus.
* Uses lower amounts of RAM, CPU, disk IO and network bandwidth compared to Prometheus.
### Quick Start
@@ -40,8 +40,7 @@ Just download `vmutils-*` archive from [releases page](https://github.com/Victor
and pass the following flags to `vmagent` binary in order to start scraping Prometheus targets:
* `-promscrape.config` with the path to Prometheus config file (it is usually located at `/etc/prometheus/prometheus.yml`)
* `-remoteWrite.url` with the remote storage endpoint such as VictoriaMetrics. Multiple `-remoteWrite.url` args can be set in parallel
in order to replicate data concurrently to multiple remote storage systems.
* `-remoteWrite.url` with the remote storage endpoint such as VictoriaMetrics. The `-remoteWrite.url` argument can be specified multiple times in order to replicate data concurrently to an arbitrary amount of remote storage systems.
Example command line:
@@ -49,7 +48,7 @@ Example command line:
/path/to/vmagent -promscrape.config=/path/to/prometheus.yml -remoteWrite.url=https://victoria-metrics-host:8428/api/v1/write
```
If you need collecting only Influx data, then the following command line would be enough:
If you only need to collect Influx data, then the following is sufficient:
```
/path/to/vmagent -remoteWrite.url=https://victoria-metrics-host:8428/api/v1/write
@@ -79,14 +78,14 @@ See [the corresponding Makefile rules](https://github.com/VictoriaMetrics/Victor
#### Drop-in replacement for Prometheus
If you use Prometheus only for scraping metrics from various targets and forwarding these metrics to remote storage,
then `vmagent` can replace such Prometheus setup. Usually `vmagent` requires lower amounts of RAM, CPU and network bandwidth comparing to Prometheus for such setup.
then `vmagent` can replace such Prometheus setup. Usually `vmagent` requires lower amounts of RAM, CPU and network bandwidth comparing to Prometheus for such a setup.
See [these docs](#how-to-collect-metrics-in-prometheus-format) for details.
#### Replication and high availability
`vmagent` replicates the collected metrics among multiple remote storage instances configured via `-remoteWrite.url` args.
If a single remote storage instance temporarily goes out of service, then the collected data remains available in another remote storage instances.
If a single remote storage instance temporarily is out of service, then the collected data remains available in another remote storage instances.
`vmagent` buffers the collected data in files at `-remoteWrite.tmpDataPath` until the remote storage becomes available again.
Then it sends the buffered data to the remote storage in order to prevent data gaps in the remote storage.
@@ -94,13 +93,13 @@ Then it sends the buffered data to the remote storage in order to prevent data g
#### Relabeling and filtering
`vmagent` can add, remove or update labels on the collected data before sending it to remote storage. Additionally,
it can remove unneeded samples via Prometheus-like relabeling before sending the collected data to remote storage.
it can remove unwanted samples via Prometheus-like relabeling before sending the collected data to remote storage.
See [these docs](#relabeling) for details.
#### Splitting data streams among multiple systems
`vmagent` supports splitting of the collected data among muliple destinations with the help of `-remoteWrite.urlRelabelConfig`,
`vmagent` supports splitting the collected data between muliple destinations with the help of `-remoteWrite.urlRelabelConfig`,
which is applied independently for each configured `-remoteWrite.url` destination. For instance, it is possible to replicate or split
data among long-term remote storage, short-term remote storage and real-time analytical system [built on top of Kafka](https://github.com/Telefonica/prometheus-kafka-adapter).
Note that each destination can receive its own subset of the collected data thanks to per-destination relabeling via `-remoteWrite.urlRelabelConfig`.
@@ -148,6 +147,10 @@ The following scrape types in [scrape_config](https://prometheus.io/docs/prometh
* `dns_sd_configs` - for scraping targets discovered from DNS records (SRV, A and AAAA).
See [dns_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#dns_sd_config) for details.
Note that `vmagent` doesn't support `refresh_interval` option these scrape configs. Use the corresponding `-promscrape.*CheckInterval`
command-line flag instead. For example, `-promscrape.consulSDCheckInterval=60s` sets `refresh_interval` for all the `consul_sd_configs`
entries to 60s. Run `vmagent -help` in order to see default values for `-promscrape.*CheckInterval` flags.
File feature requests at [our issue tracker](https://github.com/VictoriaMetrics/VictoriaMetrics/issues) if you need other service discovery mechanisms to be supported by `vmagent`.
@@ -200,12 +203,12 @@ either via `vmagent` itself or via Prometheus, so the exported metrics could be
* When `vmagent` scrapes many unreliable targets, it can flood error log with scrape errors. These errors can be suppressed
by passing `-promscrape.suppressScrapeErrors` command-line flag to `vmagent`. The most recent scrape error per each target can be observed at `http://vmagent-host:8429/targets`.
* It is recommended increasing `-remoteWrite.queues` if `vmagent` collects more than 100K samples per second
* It is recommended to increase `-remoteWrite.queues` if `vmagent` collects more than 100K samples per second
and `vmagent_remotewrite_pending_data_bytes` metric exported at `http://vmagent-host:8429/metrics` page constantly grows.
* `vmagent` buffers scraped data at `-remoteWrite.tmpDataPath` directory until it is sent to `-remoteWrite.url`.
The directory can grow big when remote storage is unavailable during extended periods of time and if `-remoteWrite.maxDiskUsagePerURL` isn't set.
If you don't want sending all the data from the directory to remote storage, just stop `vmagent` and delete the directory.
The directory can grow large when remote storage is unavailable for extended periods of time and if `-remoteWrite.maxDiskUsagePerURL` isn't set.
If you don't want to send all the data from the directory to remote storage, simply stop `vmagent` and delete the directory.
### How to build from sources

View File

@@ -13,7 +13,7 @@ sends alerts to [Alert Manager](https://github.com/prometheus/alertmanager).
* Lightweight without extra dependencies.
### TODO:
* Configuration hot reload.
* Support recording rules.
### QuickStart
@@ -48,10 +48,11 @@ Rules in group evaluated one-by-one sequentially.
* `http://<vmalert-addr>/api/v1/<groupName>/<alertID>/status" ` - get alert status by ID.
Used as alert source in AlertManager.
* `http://<vmalert-addr>/metrics` - application metrics.
* `http://<vmalert-addr>/-/reload` - hot configuration reload.
`vmalert` may be configured with `-remotewrite` flag to write alerts state in form of timeseries
`vmalert` may be configured with `-remoteWrite` flag to write alerts state in form of timeseries
via remote write protocol. Alerts state will be written as `ALERTS` timeseries. These timeseries
may be used to recover alerts state on `vmalert` restarts if `-remoteread` is configured.
may be used to recover alerts state on `vmalert` restarts if `-remoteRead` is configured.
### Configuration
@@ -81,19 +82,21 @@ Usage of vmalert:
Address to listen for http connections (default ":8880")
-notifier.url string
Prometheus alertmanager URL. Required parameter. e.g. http://127.0.0.1:9093
-remoteread.basicAuth.password string
Optional basic auth password for -remoteread.url
-remoteread.basicAuth.username string
Optional basic auth username for -remoteread.url
-remoteread.lookback duration
-remoteRead.basicAuth.password string
Optional basic auth password for -remoteRead.url
-remoteRead.basicAuth.username string
Optional basic auth username for -remoteRead.url
-remoteRead.lookback duration
Lookback defines how far to look into past for alerts timeseries. For example, if lookback=1h then range from now() to now()-1h will be scanned. (default 1h0m0s)
-remoteread.url vmalert
Optional URL to Victoria Metrics or VMSelect that will be used to restore alerts state. This configuration makes sense only if vmalert was configured with `remotewrite.url` before and has been successfully persisted its state. E.g. http://127.0.0.1:8428
-remotewrite.basicAuth.password string
Optional basic auth password for -remotewrite.url
-remotewrite.basicAuth.username string
Optional basic auth username for -remotewrite.url
-remotewrite.url string
-remoteRead.url vmalert
Optional URL to Victoria Metrics or VMSelect that will be used to restore alerts state. This configuration makes sense only if vmalert was configured with `remoteWrite.url` before and has been successfully persisted its state. E.g. http://127.0.0.1:8428
-remoteWrite.basicAuth.password string
Optional basic auth password for -remoteWrite.url
-remoteWrite.basicAuth.username string
Optional basic auth username for -remoteWrite.url
-remoteWrite.maxQueueSize
Defines the max number of pending datapoints to remote write endpoint
-remoteWrite.url string
Optional URL to Victoria Metrics or VMInsert where to persist alerts state in form of timeseries. E.g. http://127.0.0.1:8428
-rule value
Path to the file with alert rules.
@@ -109,8 +112,30 @@ Usage of vmalert:
Pass `-help` to `vmalert` in order to see the full list of supported
command-line flags with their descriptions.
To reload configuration without `vmalert` restart send SIGHUP signal
or send GET request to `/-/reload` endpoint.
### Contributing
`vmalert` is mostly designed and built by VictoriaMetrics community.
Feel free to share your experience and ideas for improving this
software. Please keep simplicity as the main priority.
### How to build from sources
It is recommended using
[binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases)
- `vmalert` is located in `vmutils-*` archives there.
#### Development build
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.13.
2. Run `make vmalert` from the root folder of the repository.
It builds `vmalert` binary and puts it into the `bin` folder.
#### Production build
1. [Install docker](https://docs.docker.com/install/).
2. Run `make vmalert-prod` from the root folder of the repository.
It builds `vmalert-prod` binary and puts it into the `bin` folder.

View File

@@ -19,7 +19,7 @@ The port can be modified via `-httpListenAddr` command-line flag.
The auth config can be reloaded by passing `SIGHUP` signal to `vmauth`.
Docker images for `vmauth` are available at [https://hub.docker.com/r/victoriametrics/vmauth/tags].
Docker images for `vmauth` are available [here](https://hub.docker.com/r/victoriametrics/vmauth/tags).
Pass `-help` to `vmauth` in order to see all the supported command-line flags with their descriptions.

View File

@@ -19,6 +19,9 @@ Backed up data can be restored with [vmrestore](https://github.com/VictoriaMetri
See [this article](https://medium.com/@valyala/speeding-up-backups-for-big-time-series-databases-533c1a927883) for more details.
See also [vmbackuper](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/466) tool built on top of `vmbackup`. This tool simplifies
creation of hourly, daily, weekly and monthly backups.
### Use cases

View File

@@ -21,7 +21,8 @@ vmrestore -src=gcs://<bucket>/<path/to/backup> -storageDataPath=<local/path/to/r
* `<local/path/to/restore>` is the path to folder where data will be restored. This folder must be passed
to VictoriaMetrics in `-storageDataPath` command-line flag after the restore process is complete.
The original `-storageDataPath` directory may contain old files. They will be susbstituted by the files from backup.
The original `-storageDataPath` directory may contain old files. They will be susbstituted by the files from backup,
i.e. the end result would be similar to [rsync --delete](https://askubuntu.com/questions/476041/how-do-i-make-rsync-delete-files-that-have-been-deleted-from-the-source-folder).
### Troubleshooting
@@ -52,7 +53,7 @@ Run `vmrestore -help` in order to see all the available options:
-envflag.prefix string
Prefix for environment variables if -envflag.enable is set
-fs.disableMmap
Whether to use pread() instead of mmap() for reading data files
Whether to use pread() instead of mmap() for reading data files. By default mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot data files bigger than 2^32 bytes in memory
-loggerFormat string
Format for logs. Possible values: default, json (default "default")
-loggerLevel string
@@ -68,8 +69,8 @@ Run `vmrestore -help` in order to see all the available options:
-src string
Source path with backup on the remote storage. Example: gcs://bucket/path/to/backup/dir, s3://bucket/path/to/backup/dir or fs:///path/to/local/backup/dir
-storageDataPath string
Destination path where backup must be restored. VictoriaMetrics must be stopped when restoring from backup. -storageDataPath dir can be non-empty. In this case only missing data is downloaded from backup (default "victoria-metrics-data")
-version
Destination path where backup must be restored. VictoriaMetrics must be stopped when restoring from backup. -storageDataPath dir can be non-empty. In this case the contents of -storageDataPath dir is synchronized with -src contents, i.e. it works like 'rsync --delete' (default "victoria-metrics-data")
-version
Show VictoriaMetrics version
```

26
go.mod
View File

@@ -1,17 +1,17 @@
module github.com/VictoriaMetrics/VictoriaMetrics
require (
cloud.google.com/go/storage v1.6.0
cloud.google.com/go/storage v1.8.0
github.com/VictoriaMetrics/fastcache v1.5.7
// Do not use the original github.com/valyala/fasthttp because of issues
// like https://github.com/valyala/fasthttp/commit/996610f021ff45fdc98c2ce7884d5fa4e7f9199b
github.com/VictoriaMetrics/fasthttp v1.0.1
github.com/VictoriaMetrics/metrics v1.11.2
github.com/VictoriaMetrics/metricsql v0.1.0
github.com/aws/aws-sdk-go v1.30.20
github.com/VictoriaMetrics/metrics v1.11.3
github.com/VictoriaMetrics/metricsql v0.2.3
github.com/aws/aws-sdk-go v1.30.28
github.com/cespare/xxhash/v2 v2.1.1
github.com/golang/protobuf v1.4.1 // indirect
github.com/golang/protobuf v1.4.2 // indirect
github.com/golang/snappy v0.0.1
github.com/klauspost/compress v1.10.5
github.com/valyala/fastjson v1.5.1
@@ -19,15 +19,15 @@ require (
github.com/valyala/gozstd v1.7.0
github.com/valyala/histogram v1.0.1
github.com/valyala/quicktemplate v1.5.0
golang.org/x/net v0.0.0-20200501053045-e0ff5e5a1de5 // indirect
golang.org/x/mod v0.3.0 // indirect
golang.org/x/net v0.0.0-20200513185701-a91f0712d120 // indirect
golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d
golang.org/x/sys v0.0.0-20200501145240-bc7a7d42d5c3
golang.org/x/tools v0.0.0-20200504193531-9bfbc385433f // indirect
google.golang.org/api v0.23.0
google.golang.org/appengine v1.6.6 // indirect
google.golang.org/genproto v0.0.0-20200430143042-b979b6f78d84 // indirect
google.golang.org/grpc v1.29.1 // indirect
gopkg.in/yaml.v2 v2.2.8
golang.org/x/sys v0.0.0-20200515095857-1151b9dac4a9
golang.org/x/tools v0.0.0-20200515010526-7d3b6ebf133d // indirect
google.golang.org/api v0.24.0
google.golang.org/genproto v0.0.0-20200514193133-8feb7f20f2a2 // indirect
gopkg.in/yaml.v2 v2.3.0
honnef.co/go/tools v0.0.1-2020.1.4 // indirect
)
go 1.13

70
go.sum
View File

@@ -8,12 +8,18 @@ cloud.google.com/go v0.46.3/go.mod h1:a6bKKbmY7er1mI7TEI4lsAkts/mkhTSZK8w33B4RAg
cloud.google.com/go v0.50.0/go.mod h1:r9sluTvynVuxRIOHXQEHMFffphuXHOMZMycpNR5e6To=
cloud.google.com/go v0.52.0/go.mod h1:pXajvRH/6o3+F9jDHZWQ5PbGhn+o8w9qiu/CffaVdO4=
cloud.google.com/go v0.53.0/go.mod h1:fp/UouUEsRkN6ryDKNW/Upv/JBKnv6WDthjR6+vze6M=
cloud.google.com/go v0.54.0/go.mod h1:1rq2OEkV3YMf6n/9ZvGWI3GWw0VoqH/1x2nd8Is/bPc=
cloud.google.com/go v0.56.0 h1:WRz29PgAsVEyPSDHyk+0fpEkwEFyfhHn+JbksT6gIL4=
cloud.google.com/go v0.56.0/go.mod h1:jr7tqZxxKOVYizybht9+26Z/gUq7tiRzu+ACVAMbKVk=
cloud.google.com/go v0.57.0 h1:EpMNVUorLiZIELdMZbCYX/ByTFCdoYopYAGxaGVz9ms=
cloud.google.com/go v0.57.0/go.mod h1:oXiQ6Rzq3RAkkY7N6t3TcE6jE+CIBBbA36lwQ1JyzZs=
cloud.google.com/go/bigquery v1.0.1/go.mod h1:i/xbL2UlR5RvWAURpBYZTtm/cXjCha9lbfbpx4poX+o=
cloud.google.com/go/bigquery v1.3.0/go.mod h1:PjpwJnslEMmckchkHFfq+HTD2DmtT67aNFKH1/VBDHE=
cloud.google.com/go/bigquery v1.4.0 h1:xE3CPsOgttP4ACBePh79zTKALtXwn/Edhcr16R5hMWU=
cloud.google.com/go/bigquery v1.4.0/go.mod h1:S8dzgnTigyfTmLBfrtrhyYhwRxG72rYxvftPBK2Dvzc=
cloud.google.com/go/bigquery v1.5.0/go.mod h1:snEHRnqQbz117VIFhE8bmtwIDY80NLUZUMb4Nv6dBIg=
cloud.google.com/go/bigquery v1.7.0 h1:a/O/bK/vWrYGOTFtH8di4rBxMZnmkjy+Y5LxpDwo+dA=
cloud.google.com/go/bigquery v1.7.0/go.mod h1://okPTzCYNXSlb24MZs83e2Do+h+VXtc4gLoIoXIAPc=
cloud.google.com/go/datastore v1.0.0/go.mod h1:LXYbyblFSglQ5pkeyhO+Qmw7ukd3C+pD7TKLgZqpHYE=
cloud.google.com/go/datastore v1.1.0 h1:/May9ojXjRkPBNVrq+oWLqmWCkr4OU5uRY29bu0mRyQ=
cloud.google.com/go/datastore v1.1.0/go.mod h1:umbIZjpQpHh4hmRpGhH4tLFup+FVzqBi1b3c64qFpCk=
@@ -21,10 +27,14 @@ cloud.google.com/go/pubsub v1.0.1/go.mod h1:R0Gpsv3s54REJCy4fxDixWD93lHJMoZTyQ2k
cloud.google.com/go/pubsub v1.1.0/go.mod h1:EwwdRX2sKPjnvnqCa270oGRyludottCI76h+R3AArQw=
cloud.google.com/go/pubsub v1.2.0 h1:Lpy6hKgdcl7a3WGSfJIFmxmcdjSpP6OmBEfcOv1Y680=
cloud.google.com/go/pubsub v1.2.0/go.mod h1:jhfEVHT8odbXTkndysNHCcx0awwzvfOlguIAii9o8iA=
cloud.google.com/go/pubsub v1.3.1 h1:ukjixP1wl0LpnZ6LWtZJ0mX5tBmjp1f8Sqer8Z2OMUU=
cloud.google.com/go/pubsub v1.3.1/go.mod h1:i+ucay31+CNRpDW4Lu78I4xXG+O1r/MAHgjpRVR+TSU=
cloud.google.com/go/storage v1.0.0/go.mod h1:IhtSnM/ZTZV8YYJWCY8RULGVqBDmpoyjwiyrjsg+URw=
cloud.google.com/go/storage v1.5.0/go.mod h1:tpKbwo567HUNpVclU5sGELwQWBDZ8gh0ZeosJ0Rtdos=
cloud.google.com/go/storage v1.6.0 h1:UDpwYIwla4jHGzZJaEJYx1tOejbgSoNqsAfHAUYe2r8=
cloud.google.com/go/storage v1.6.0/go.mod h1:N7U0C8pVQ/+NIKOBQyamJIeKQKkZ+mxpohlUTyfDhBk=
cloud.google.com/go/storage v1.8.0 h1:86K1Gel7BQ9/WmNWn7dTKMvTLFzwtBe5FNqYbi9X35g=
cloud.google.com/go/storage v1.8.0/go.mod h1:Wv1Oy7z6Yz3DshWRJFhqM/UCfaWIRTdp0RXyy7KQOVs=
dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU=
github.com/BurntSushi/toml v0.3.1 h1:WXkYYl6Yr3qBf1K79EBnL4mak0OimBfB0XUf9Vl28OQ=
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
@@ -35,12 +45,14 @@ github.com/VictoriaMetrics/fasthttp v1.0.1 h1:I7YdbswTIW63WxoFoUOSNxeOEGB46rdKUL
github.com/VictoriaMetrics/fasthttp v1.0.1/go.mod h1:BqgsieH90PR7x97c89j+eqZDloKkDhAEQTwhLw6jw/4=
github.com/VictoriaMetrics/metrics v1.11.2 h1:t/ceLP6SvagUqypCKU7cI7+tQn54+TIV/tGoxihHvx8=
github.com/VictoriaMetrics/metrics v1.11.2/go.mod h1:LU2j9qq7xqZYXz8tF3/RQnB2z2MbZms5TDiIg9/NHiQ=
github.com/VictoriaMetrics/metricsql v0.1.0 h1:IoyG84PCwFY15rNuxpr2nQ+YZBYIhnd7zTiaGo5BNpc=
github.com/VictoriaMetrics/metricsql v0.1.0/go.mod h1:UIjd9S0W1UnTWlJdM0wLS+2pfuPqjwqKoK8yTos+WyE=
github.com/VictoriaMetrics/metrics v1.11.3 h1:eSfXc0CrquKa1VTNUvhP+dhNjLUZHQGTFfp19mYCQWE=
github.com/VictoriaMetrics/metrics v1.11.3/go.mod h1:LU2j9qq7xqZYXz8tF3/RQnB2z2MbZms5TDiIg9/NHiQ=
github.com/VictoriaMetrics/metricsql v0.2.3 h1:xGscDmLoeIV7+8qX/mdHnOY0vu4m+wHIVGMoy/nBovY=
github.com/VictoriaMetrics/metricsql v0.2.3/go.mod h1:UIjd9S0W1UnTWlJdM0wLS+2pfuPqjwqKoK8yTos+WyE=
github.com/allegro/bigcache v1.2.1-0.20190218064605-e24eb225f156 h1:eMwmnE/GDgah4HI848JfFxHt+iPb26b4zyfspmqY0/8=
github.com/allegro/bigcache v1.2.1-0.20190218064605-e24eb225f156/go.mod h1:Cb/ax3seSYIx7SuZdm2G2xzfwmv3TPSk2ucNfQESPXM=
github.com/aws/aws-sdk-go v1.30.20 h1:ktsy2vodSZxz/arYqo7DlpkIeNohHL+4Rmjdo7YGtrE=
github.com/aws/aws-sdk-go v1.30.20/go.mod h1:5zCpMtNQVjRREroY7sYe8lOMRSxkhG6MZveU8YkpAk0=
github.com/aws/aws-sdk-go v1.30.28 h1:SaPM7dlmp7h3Lj1nJ4jdzOkTdom08+g20k7AU5heZYg=
github.com/aws/aws-sdk-go v1.30.28/go.mod h1:5zCpMtNQVjRREroY7sYe8lOMRSxkhG6MZveU8YkpAk0=
github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
github.com/cespare/xxhash/v2 v2.1.1 h1:6MnRN8NT7+YBpUIWxHtefFZOKTAPgGjpQSxqLNn0+qY=
github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
@@ -69,11 +81,13 @@ github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfb
github.com/golang/mock v1.2.0/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A=
github.com/golang/mock v1.3.1/go.mod h1:sBzyDLLjw3U8JLTeZvSv8jJB+tU5PVekmnlKIyFUx0Y=
github.com/golang/mock v1.4.0/go.mod h1:UOMv5ysSaYNkG+OFQykRIcU/QvvxJf3p21QfJ2Bt3cw=
github.com/golang/mock v1.4.1/go.mod h1:UOMv5ysSaYNkG+OFQykRIcU/QvvxJf3p21QfJ2Bt3cw=
github.com/golang/mock v1.4.3/go.mod h1:UOMv5ysSaYNkG+OFQykRIcU/QvvxJf3p21QfJ2Bt3cw=
github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/golang/protobuf v1.3.3/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw=
github.com/golang/protobuf v1.3.4/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw=
github.com/golang/protobuf v1.3.5/go.mod h1:6O5/vntMXwX2lRkT1hjjk0nAC1IDOTvTlVgjlRvqsdk=
github.com/golang/protobuf v1.4.0-rc.1/go.mod h1:ceaxUfeHdC40wWswd/P6IGgMaK3YpKi5j83Wpe3EHw8=
github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod h1:xKAWHe0F5eneWXFV3EuXVDTCmh+JuBKY0li0aMyXATA=
@@ -83,6 +97,8 @@ github.com/golang/protobuf v1.4.0 h1:oOuy+ugB+P/kBdUnG5QaMXSIyJ1q38wWSojYCb3z5VQ
github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0=
github.com/golang/protobuf v1.4.1 h1:ZFgWrT+bLgsYPirOnRfKLYJLvssAegOj/hgyMFdJZe0=
github.com/golang/protobuf v1.4.1/go.mod h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QDs8UjoX8=
github.com/golang/protobuf v1.4.2 h1:+Z5KGCizgyZCbGh1KZqA0fcLLkwbsjIzS4aV2v7wJX0=
github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI=
github.com/golang/snappy v0.0.1 h1:Qgr9rKW7uDUkrbSmQeiDsGa8SjGyCOGtuasMWwvp2P4=
github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
github.com/google/btree v0.0.0-20180813153112-4030bb1f1f0c/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ=
@@ -99,6 +115,7 @@ github.com/google/pprof v0.0.0-20190515194954-54271f7e092f/go.mod h1:zfwlbNMJ+OI
github.com/google/pprof v0.0.0-20191218002539-d4f498aebedc/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM=
github.com/google/pprof v0.0.0-20200212024743-f11f1df84d12/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM=
github.com/google/pprof v0.0.0-20200229191704-1ebb73c60ed3/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM=
github.com/google/pprof v0.0.0-20200430221834-fc25d7d30c6d/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM=
github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI=
github.com/googleapis/gax-go/v2 v2.0.4/go.mod h1:0Wqv26UfaUD9n4G6kQubkQ+KchISgw+vpHVxEJEs9eg=
github.com/googleapis/gax-go/v2 v2.0.5 h1:sjZBwGj9Jlw33ImPtvFviGYvseOtDM7hkSKB7+Tv3SM=
@@ -186,6 +203,8 @@ golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzB
golang.org/x/mod v0.1.1-0.20191107180719-034126e5016b/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg=
golang.org/x/mod v0.2.0 h1:KU7oHjnv3XNWfa5COkzUifxZmxp1TyI7ImMXqFxLwvQ=
golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.3.0 h1:RM4zey1++hCTbCVQfnWeKs9/IEsaBLA8vTkd0WVtmH4=
golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
@@ -202,9 +221,14 @@ golang.org/x/net v0.0.0-20200114155413-6afb5195e5aa/go.mod h1:z5CRVTTTmAJ677TzLL
golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200222125558-5a598a2470a0/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200301022130-244492dfa37a/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
golang.org/x/net v0.0.0-20200501053045-e0ff5e5a1de5 h1:WQ8q63x+f/zpC8Ac1s9wLElVoHhm32p6tudrU72n1QA=
golang.org/x/net v0.0.0-20200501053045-e0ff5e5a1de5/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
golang.org/x/net v0.0.0-20200506145744-7e3656a0809f h1:QBjCr1Fz5kw158VqdE9JfI9cJnl/ymnJWAdMuinqL7Y=
golang.org/x/net v0.0.0-20200506145744-7e3656a0809f/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
golang.org/x/net v0.0.0-20200513185701-a91f0712d120 h1:EZ3cVSzKOlJxAd8e8YAJ7no8nNypTxexh/YE/xW3ZEY=
golang.org/x/net v0.0.0-20200513185701-a91f0712d120/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
@@ -236,10 +260,14 @@ golang.org/x/sys v0.0.0-20200122134326-e047566fdf82/go.mod h1:h1NjWce9XRLGQEsW7w
golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200212091648-12a6c2dcc1e4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200223170610-d5e6a3e2c0ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200302150141-5c8b2ff67527/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200331124033-c3d80250170d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200501145240-bc7a7d42d5c3 h1:5B6i6EAiSYyejWfvc5Rc9BbI3rzIsrrXfAQBWnYfn+w=
golang.org/x/sys v0.0.0-20200501145240-bc7a7d42d5c3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200501052902-10377860bb8e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200511232937-7e40ca221e25 h1:OKbAoGs4fGM5cPLlVQLZGYkFC8OnOfgo6tt0Smf9XhM=
golang.org/x/sys v0.0.0-20200511232937-7e40ca221e25/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200515095857-1151b9dac4a9 h1:YTzHMGlqJu67/uEo1lBv0n3wBXhXNeUbB1XfN2vmTm0=
golang.org/x/sys v0.0.0-20200515095857-1151b9dac4a9/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
@@ -277,9 +305,14 @@ golang.org/x/tools v0.0.0-20200204074204-1cc6d1ef6c74/go.mod h1:TB2adYChydJhpapK
golang.org/x/tools v0.0.0-20200207183749-b753a1ba74fa/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28=
golang.org/x/tools v0.0.0-20200212150539-ea181f53ac56/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28=
golang.org/x/tools v0.0.0-20200224181240-023911ca70b2/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28=
golang.org/x/tools v0.0.0-20200227222343-706bc42d1f0d/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28=
golang.org/x/tools v0.0.0-20200304193943-95d2e580d8eb/go.mod h1:o4KQGtdN14AW+yjsvvwRTJJuXz8XRtIHtEnmAXLyFUw=
golang.org/x/tools v0.0.0-20200312045724-11d5b4c81c7d/go.mod h1:o4KQGtdN14AW+yjsvvwRTJJuXz8XRtIHtEnmAXLyFUw=
golang.org/x/tools v0.0.0-20200331025713-a30bf2db82d4/go.mod h1:Sl4aGygMT6LrqrWclx+PTx3U+LnKx/seiNR+3G19Ar8=
golang.org/x/tools v0.0.0-20200504193531-9bfbc385433f h1:sPpKxOcxUWPeIIqQbn06sX/NBtDl45ZGdi67lZECBsw=
golang.org/x/tools v0.0.0-20200504193531-9bfbc385433f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
golang.org/x/tools v0.0.0-20200501065659-ab2804fb9c9d/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
golang.org/x/tools v0.0.0-20200512131952-2bc93b1c0c88/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
golang.org/x/tools v0.0.0-20200515010526-7d3b6ebf133d h1:n6zwymXmN9rCClNNmCWwV3qwMmBcRw/WeIGDK8Qnzk4=
golang.org/x/tools v0.0.0-20200515010526-7d3b6ebf133d/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4=
@@ -293,9 +326,11 @@ google.golang.org/api v0.14.0/go.mod h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsb
google.golang.org/api v0.15.0/go.mod h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsbkAI=
google.golang.org/api v0.17.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE=
google.golang.org/api v0.18.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE=
google.golang.org/api v0.19.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE=
google.golang.org/api v0.20.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE=
google.golang.org/api v0.23.0 h1:YlvGEOq2NA2my8cZ/9V8BcEO9okD48FlJcdqN0xJL3s=
google.golang.org/api v0.23.0/go.mod h1:lIXQywCXRcnZPGlsd8NbLnOjtAoL6em04bJ9+z0MncE=
google.golang.org/api v0.22.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE=
google.golang.org/api v0.24.0 h1:cG03eaksBzhfSIk7JRGctfp3lanklcOM/mTGvow7BbQ=
google.golang.org/api v0.24.0/go.mod h1:lIXQywCXRcnZPGlsd8NbLnOjtAoL6em04bJ9+z0MncE=
google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM=
google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4=
google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4=
@@ -320,9 +355,16 @@ google.golang.org/genproto v0.0.0-20200122232147-0452cf42e150/go.mod h1:n3cpQtvx
google.golang.org/genproto v0.0.0-20200204135345-fa8e72b47b90/go.mod h1:GmwEX6Z4W5gMy59cAlVYjN9JhxgbQH6Gn+gFDQe2lzA=
google.golang.org/genproto v0.0.0-20200212174721-66ed5ce911ce/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c=
google.golang.org/genproto v0.0.0-20200224152610-e50cd9704f63/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c=
google.golang.org/genproto v0.0.0-20200228133532-8c2c7df3a383/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c=
google.golang.org/genproto v0.0.0-20200305110556-506484158171/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c=
google.golang.org/genproto v0.0.0-20200312145019-da6875a35672/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c=
google.golang.org/genproto v0.0.0-20200331122359-1ee6d9798940/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c=
google.golang.org/genproto v0.0.0-20200430143042-b979b6f78d84 h1:pSLkPbrjnPyLDYUO2VM9mDLqo2V6CFBY84lFSZAfoi4=
google.golang.org/genproto v0.0.0-20200430143042-b979b6f78d84/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c=
google.golang.org/genproto v0.0.0-20200511104702-f5ebc3bea380 h1:xriR1EgvKfkKxIoU2uUvrMVl+H26359loFFUleSMXFo=
google.golang.org/genproto v0.0.0-20200511104702-f5ebc3bea380/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c=
google.golang.org/genproto v0.0.0-20200514193133-8feb7f20f2a2 h1:RwW6+LxyOQJ7oeoZ76GIJlwt/O0J5cN2fk+q/jK27kQ=
google.golang.org/genproto v0.0.0-20200514193133-8feb7f20f2a2/go.mod h1:YsZOwe1myG/8QRHRsmBRE1LrgQY60beZKjly0O1fX9U=
google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c=
google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38=
google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM=
@@ -342,13 +384,15 @@ google.golang.org/protobuf v1.21.0 h1:qdOKuR/EIArgaWNjetjgTzgVTAZ+S/WXVrq9HW9zim
google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo=
google.golang.org/protobuf v1.22.0 h1:cJv5/xdbk1NnMPR1VP9+HU6gupuG9MLBoH1r6RHZ2MY=
google.golang.org/protobuf v1.22.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
google.golang.org/protobuf v1.23.0 h1:4MY060fB1DLGMB/7MBTLnwQUY6+F09GEiz6SsrNqyzM=
google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 h1:qIbj1fsPNlZgppZ+VLlY7N33q108Sa+fhmuc+sWQYwY=
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI=
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v2 v2.2.8 h1:obN1ZagJSUGI0Ek/LBmuj4SNLPfIny3KsKFopxRdj10=
gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v2 v2.3.0 h1:clyUAQHOM3G0M3f5vQj7LuJrETvjVot3Z5el9nffUtU=
gopkg.in/yaml.v2 v2.3.0/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
honnef.co/go/tools v0.0.0-20190106161140-3f1c8253044a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
honnef.co/go/tools v0.0.0-20190418001031-e561f6794a2a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
@@ -356,6 +400,8 @@ honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWh
honnef.co/go/tools v0.0.1-2019.2.3/go.mod h1:a3bituU0lyd329TUQxRnasdCoJDkEUEAqEt0JzvZhAg=
honnef.co/go/tools v0.0.1-2020.1.3 h1:sXmLre5bzIR6ypkjXCDI3jHPssRhc8KD/Ome589sc3U=
honnef.co/go/tools v0.0.1-2020.1.3/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k=
honnef.co/go/tools v0.0.1-2020.1.4 h1:UoveltGrhghAA7ePc+e+QYDHXrBps2PqFZiHkGR/xK8=
honnef.co/go/tools v0.0.1-2020.1.4/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k=
rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8=
rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0=
rsc.io/sampler v1.3.0/go.mod h1:T1hPZKmBbMNahiBKFy5HrXp6adAjACjK9JXDnKaTXpA=

View File

@@ -113,10 +113,16 @@ func (r *Restore) Run() error {
partsToDelete := common.PartsDifference(dstParts, srcParts)
deleteSize := uint64(0)
if len(partsToDelete) > 0 {
// Fully remove local file if certain parts from the remote part are missing.
// Remove only files with the missing part at offset 0.
// Assume other files are partially downloaded during the previous Restore.Run call,
// so only the last part in them may be incomplete.
// The last part for partially downloaded files will be re-downloaded later.
// This addresses https://github.com/VictoriaMetrics/VictoriaMetrics/issues/487 .
pathsToDelete := make(map[string]bool)
for _, p := range partsToDelete {
pathsToDelete[p.Path] = true
if p.Offset == 0 {
pathsToDelete[p.Path] = true
}
}
logger.Infof("deleting %d files from %s", len(pathsToDelete), dst)
for path := range pathsToDelete {
@@ -153,7 +159,8 @@ func (r *Restore) Run() error {
logger.Infof("downloading %d parts from %s to %s", len(partsToCopy), src, dst)
bytesDownloaded := uint64(0)
err = runParallelPerPath(concurrency, perPath, func(parts []common.Part) error {
// Sort partsToCopy in order to properly grow file size during downloading.
// Sort partsToCopy in order to properly grow file size during downloading
// and to properly resume downloading of incomplete files on the next Restore.Run call.
common.SortParts(parts)
for _, p := range parts {
logger.Infof("downloading %s from %s to %s", &p, src, dst)
@@ -169,7 +176,7 @@ func (r *Restore) Run() error {
return fmt.Errorf("cannot download %s to %s: %s", &p, dst, err)
}
if err := wc.Close(); err != nil {
return fmt.Errorf("cannot close reader fro %s from %s: %s", &p, src, err)
return fmt.Errorf("cannot close reader from %s from %s: %s", &p, src, err)
}
}
return nil

View File

@@ -230,6 +230,6 @@ func NewRemoteFS(path string) (common.RemoteFS, error) {
}
return fs, nil
default:
return nil, fmt.Errorf("unsupported scheme %q in `-dst`", scheme)
return nil, fmt.Errorf("unsupported scheme %q", scheme)
}
}

40
lib/fasttime/fasttime.go Normal file
View File

@@ -0,0 +1,40 @@
package fasttime
import (
"sync/atomic"
"time"
)
func init() {
go func() {
ticker := time.NewTicker(time.Second)
defer ticker.Stop()
for tm := range ticker.C {
t := uint64(tm.Unix())
atomic.StoreUint64(&currentTimestamp, t)
}
}()
}
var currentTimestamp = uint64(time.Now().Unix())
// UnixTimestamp returns the current unix timestamp in seconds.
//
// It is faster than time.Now().Unix()
func UnixTimestamp() uint64 {
return atomic.LoadUint64(&currentTimestamp)
}
// UnixDate returns date from the current unix timestamp.
//
// The date is calculated by dividing unix timestamp by (24*3600)
func UnixDate() uint64 {
return UnixTimestamp() / (24 * 3600)
}
// UnixHour returns hour from the current unix timestamp.
//
// The hour is calculated by dividing unix timestamp by 3600
func UnixHour() uint64 {
return UnixTimestamp() / 3600
}

View File

@@ -0,0 +1,30 @@
package fasttime
import (
"testing"
"time"
)
func TestUnixTimestamp(t *testing.T) {
tsExpected := uint64(time.Now().Unix())
ts := UnixTimestamp()
if ts-tsExpected > 1 {
t.Fatalf("unexpected UnixTimestamp; got %d; want %d", ts, tsExpected)
}
}
func TestUnixDate(t *testing.T) {
dateExpected := uint64(time.Now().Unix() / (24 * 3600))
date := UnixDate()
if date-dateExpected > 1 {
t.Fatalf("unexpected UnixDate; got %d; want %d", date, dateExpected)
}
}
func TestUnixHour(t *testing.T) {
hourExpected := uint64(time.Now().Unix() / 3600)
hour := UnixHour()
if hour-hourExpected > 1 {
t.Fatalf("unexpected UnixHour; got %d; want %d", hour, hourExpected)
}
}

View File

@@ -0,0 +1,32 @@
package fasttime
import (
"sync/atomic"
"testing"
"time"
)
func BenchmarkUnixTimestamp(b *testing.B) {
b.ReportAllocs()
b.RunParallel(func(pb *testing.PB) {
var ts uint64
for pb.Next() {
ts += UnixTimestamp()
}
atomic.StoreUint64(&Sink, ts)
})
}
func BenchmarkTimeNowUnix(b *testing.B) {
b.ReportAllocs()
b.RunParallel(func(pb *testing.PB) {
var ts uint64
for pb.Next() {
ts += uint64(time.Now().Unix())
}
atomic.StoreUint64(&Sink, ts)
})
}
// Sink should prevent from code elimination by optimizing compiler
var Sink uint64

View File

@@ -10,7 +10,10 @@ import (
"golang.org/x/sys/unix"
)
var disableMmap = flag.Bool("fs.disableMmap", false, "Whether to use pread() instead of mmap() for reading data files")
var disableMmap = flag.Bool("fs.disableMmap", is32BitPtr, "Whether to use pread() instead of mmap() for reading data files. "+
"By default mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot data files bigger than 2^32 bytes in memory")
const is32BitPtr = (^uintptr(0) >> 32) == 0
// MustReadAtCloser is rand-access read interface.
type MustReadAtCloser interface {

View File

@@ -148,18 +148,18 @@ func unmarshalBlockHeaders(dst []blockHeader, src []byte, blockHeadersCount int)
for i := 0; i < blockHeadersCount; i++ {
tail, err := dst[dstLen+i].Unmarshal(src)
if err != nil {
return nil, fmt.Errorf("cannot unmarshal block header: %s", err)
return dst, fmt.Errorf("cannot unmarshal block header: %s", err)
}
src = tail
}
if len(src) > 0 {
return nil, fmt.Errorf("unexpected non-zero tail left after unmarshaling %d block headers; len(tail)=%d", blockHeadersCount, len(src))
return dst, fmt.Errorf("unexpected non-zero tail left after unmarshaling %d block headers; len(tail)=%d", blockHeadersCount, len(src))
}
newBHS := dst[dstLen:]
// Verify that block headers are sorted by firstItem.
if !sort.SliceIsSorted(newBHS, func(i, j int) bool { return string(newBHS[i].firstItem) < string(newBHS[j].firstItem) }) {
return nil, fmt.Errorf("block headers must be sorted by firstItem; unmarshaled unsorted block headers: %#v", newBHS)
return dst, fmt.Errorf("block headers must be sorted by firstItem; unmarshaled unsorted block headers: %#v", newBHS)
}
return dst, nil

View File

@@ -63,9 +63,12 @@ func (bsw *blockStreamWriter) reset() {
bsw.mrFirstItemCaught = false
}
func (bsw *blockStreamWriter) InitFromInmemoryPart(ip *inmemoryPart, compressLevel int) {
func (bsw *blockStreamWriter) InitFromInmemoryPart(ip *inmemoryPart) {
bsw.reset()
bsw.compressLevel = compressLevel
// Use the minimum compression level for in-memory blocks,
// since they are going to be re-compressed during the merge into file-based blocks.
bsw.compressLevel = -5 // See https://github.com/facebook/zstd/releases/tag/v1.3.4
bsw.metaindexWriter = &ip.metaindexData
bsw.indexWriter = &ip.indexData

View File

@@ -151,7 +151,7 @@ var isInTest = func() bool {
return strings.HasSuffix(os.Args[0], ".test")
}()
// MarshalUnsortedData marshals sorted items from ib to sb.
// MarshalSortedData marshals sorted items from ib to sb.
//
// It also:
// - appends first item to firstItemDst and returns the result.

View File

@@ -29,14 +29,14 @@ func TestMultilevelMerge(t *testing.T) {
// First level merge
var dstIP1 inmemoryPart
var bsw1 blockStreamWriter
bsw1.InitFromInmemoryPart(&dstIP1, 0)
bsw1.InitFromInmemoryPart(&dstIP1)
if err := mergeBlockStreams(&dstIP1.ph, &bsw1, bsrs[:5], nil, nil, &itemsMerged); err != nil {
t.Fatalf("cannot merge first level part 1: %s", err)
}
var dstIP2 inmemoryPart
var bsw2 blockStreamWriter
bsw2.InitFromInmemoryPart(&dstIP2, 0)
bsw2.InitFromInmemoryPart(&dstIP2)
if err := mergeBlockStreams(&dstIP2.ph, &bsw2, bsrs[5:], nil, nil, &itemsMerged); err != nil {
t.Fatalf("cannot merge first level part 2: %s", err)
}
@@ -53,7 +53,7 @@ func TestMultilevelMerge(t *testing.T) {
newTestBlockStreamReader(&dstIP1),
newTestBlockStreamReader(&dstIP2),
}
bsw.InitFromInmemoryPart(&dstIP, 0)
bsw.InitFromInmemoryPart(&dstIP)
if err := mergeBlockStreams(&dstIP.ph, &bsw, bsrsTop, nil, nil, &itemsMerged); err != nil {
t.Fatalf("cannot merge second level: %s", err)
}
@@ -72,7 +72,7 @@ func TestMergeForciblyStop(t *testing.T) {
bsrs, _ := newTestInmemoryBlockStreamReaders(20, 4000)
var dstIP inmemoryPart
var bsw blockStreamWriter
bsw.InitFromInmemoryPart(&dstIP, 0)
bsw.InitFromInmemoryPart(&dstIP)
ch := make(chan struct{})
var itemsMerged uint64
close(ch)
@@ -119,7 +119,7 @@ func testMergeBlockStreamsSerial(blocksToMerge, maxItemsPerBlock int) error {
var itemsMerged uint64
var dstIP inmemoryPart
var bsw blockStreamWriter
bsw.InitFromInmemoryPart(&dstIP, 0)
bsw.InitFromInmemoryPart(&dstIP)
if err := mergeBlockStreams(&dstIP.ph, &bsw, bsrs, nil, nil, &itemsMerged); err != nil {
return fmt.Errorf("cannot merge block streams: %s", err)
}

View File

@@ -7,6 +7,7 @@ import (
"sync/atomic"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/filestream"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/memory"
@@ -227,7 +228,7 @@ func (idxbc *indexBlockCache) cleaner() {
}
func (idxbc *indexBlockCache) cleanByTimeout() {
currentTime := atomic.LoadUint64(&currentTimestamp)
currentTime := fasttime.UnixTimestamp()
idxbc.mu.Lock()
for k, idxbe := range idxbc.m {
// Delete items accessed more than 10 minutes ago.
@@ -245,7 +246,7 @@ func (idxbc *indexBlockCache) Get(k uint64) *indexBlock {
idxbc.mu.RUnlock()
if idxbe != nil {
currentTime := atomic.LoadUint64(&currentTimestamp)
currentTime := fasttime.UnixTimestamp()
if atomic.LoadUint64(&idxbe.lastAccessTime) != currentTime {
atomic.StoreUint64(&idxbe.lastAccessTime, currentTime)
}
@@ -256,9 +257,7 @@ func (idxbc *indexBlockCache) Get(k uint64) *indexBlock {
}
// Put puts idxb under the key k into idxbc.
//
// Returns true if the idxb has been put into idxbc.
func (idxbc *indexBlockCache) Put(k uint64, idxb *indexBlock) bool {
func (idxbc *indexBlockCache) Put(k uint64, idxb *indexBlock) {
idxbc.mu.Lock()
// Remove superflouos entries.
@@ -278,12 +277,11 @@ func (idxbc *indexBlockCache) Put(k uint64, idxb *indexBlock) bool {
// Store idxb in the cache.
idxbe := &indexBlockCacheEntry{
lastAccessTime: atomic.LoadUint64(&currentTimestamp),
lastAccessTime: fasttime.UnixTimestamp(),
idxb: idxb,
}
idxbc.m[k] = idxbe
idxbc.mu.Unlock()
return true
}
func (idxbc *indexBlockCache) Len() uint64 {
@@ -377,7 +375,7 @@ func (ibc *inmemoryBlockCache) cleaner() {
}
func (ibc *inmemoryBlockCache) cleanByTimeout() {
currentTime := atomic.LoadUint64(&currentTimestamp)
currentTime := fasttime.UnixTimestamp()
ibc.mu.Lock()
for k, ibe := range ibc.m {
// Delete items accessed more than 10 minutes ago.
@@ -396,7 +394,7 @@ func (ibc *inmemoryBlockCache) Get(k inmemoryBlockCacheKey) *inmemoryBlock {
ibc.mu.RUnlock()
if ibe != nil {
currentTime := atomic.LoadUint64(&currentTimestamp)
currentTime := fasttime.UnixTimestamp()
if atomic.LoadUint64(&ibe.lastAccessTime) != currentTime {
atomic.StoreUint64(&ibe.lastAccessTime, currentTime)
}
@@ -407,9 +405,7 @@ func (ibc *inmemoryBlockCache) Get(k inmemoryBlockCacheKey) *inmemoryBlock {
}
// Put puts ib under key k into ibc.
//
// Returns true if ib was put into ibc.
func (ibc *inmemoryBlockCache) Put(k inmemoryBlockCacheKey, ib *inmemoryBlock) bool {
func (ibc *inmemoryBlockCache) Put(k inmemoryBlockCacheKey, ib *inmemoryBlock) {
ibc.mu.Lock()
// Clean superflouos entries in cache.
@@ -429,12 +425,11 @@ func (ibc *inmemoryBlockCache) Put(k inmemoryBlockCacheKey, ib *inmemoryBlock) b
// Store ib in the cache.
ibe := &inmemoryBlockCacheEntry{
lastAccessTime: atomic.LoadUint64(&currentTimestamp),
lastAccessTime: fasttime.UnixTimestamp(),
ib: ib,
}
ibc.m[k] = ibe
ibc.mu.Unlock()
return true
}
func (ibc *inmemoryBlockCache) Len() uint64 {
@@ -451,16 +446,3 @@ func (ibc *inmemoryBlockCache) Requests() uint64 {
func (ibc *inmemoryBlockCache) Misses() uint64 {
return atomic.LoadUint64(&ibc.misses)
}
func init() {
go func() {
ticker := time.NewTicker(time.Second)
defer ticker.Stop()
for tm := range ticker.C {
t := uint64(tm.Unix())
atomic.StoreUint64(&currentTimestamp, t)
}
}()
}
var currentTimestamp uint64

View File

@@ -25,9 +25,6 @@ type partSearch struct {
// The remaining block headers to scan in the current metaindexRow.
bhs []blockHeader
// Pointer to index block, which may be reused.
indexBlockReuse *indexBlock
// Pointer to inmemory block, which may be reused.
inmemoryBlockReuse *inmemoryBlock
@@ -53,10 +50,6 @@ func (ps *partSearch) reset() {
ps.p = nil
ps.mrs = nil
ps.bhs = nil
if ps.indexBlockReuse != nil {
putIndexBlock(ps.indexBlockReuse)
ps.indexBlockReuse = nil
}
if ps.inmemoryBlockReuse != nil {
putInmemoryBlock(ps.inmemoryBlockReuse)
ps.inmemoryBlockReuse = nil
@@ -275,40 +268,25 @@ func (ps *partSearch) nextBlock() error {
}
func (ps *partSearch) nextBHS() error {
if ps.indexBlockReuse != nil {
putIndexBlock(ps.indexBlockReuse)
ps.indexBlockReuse = nil
}
if len(ps.mrs) == 0 {
return io.EOF
}
mr := &ps.mrs[0]
ps.mrs = ps.mrs[1:]
idxb, mayReuseIndexBlock, err := ps.getIndexBlock(mr)
if err != nil {
return fmt.Errorf("cannot get index block: %s", err)
}
if mayReuseIndexBlock {
ps.indexBlockReuse = idxb
idxbKey := mr.indexBlockOffset
idxb := ps.idxbCache.Get(idxbKey)
if idxb == nil {
var err error
idxb, err = ps.readIndexBlock(mr)
if err != nil {
return fmt.Errorf("cannot read index block: %s", err)
}
ps.idxbCache.Put(idxbKey, idxb)
}
ps.bhs = idxb.bhs
return nil
}
func (ps *partSearch) getIndexBlock(mr *metaindexRow) (*indexBlock, bool, error) {
idxbKey := mr.indexBlockOffset
idxb := ps.idxbCache.Get(idxbKey)
if idxb != nil {
return idxb, false, nil
}
idxb, err := ps.readIndexBlock(mr)
if err != nil {
return nil, false, err
}
ok := ps.idxbCache.Put(idxbKey, idxb)
return idxb, !ok, nil
}
func (ps *partSearch) readIndexBlock(mr *metaindexRow) (*indexBlock, error) {
ps.compressedIndexBuf = bytesutil.Resize(ps.compressedIndexBuf, int(mr.indexBlockSize))
ps.p.indexFile.MustReadAt(ps.compressedIndexBuf, int64(mr.indexBlockOffset))
@@ -347,8 +325,8 @@ func (ps *partSearch) getInmemoryBlock(bh *blockHeader) (*inmemoryBlock, bool, e
if err != nil {
return nil, false, err
}
ok := ps.ibCache.Put(ibKey, ib)
return ib, !ok, nil
ps.ibCache.Put(ibKey, ib)
return ib, false, nil
}
func (ps *partSearch) readInmemoryBlock(bh *blockHeader) (*inmemoryBlock, error) {

View File

@@ -149,7 +149,7 @@ func newTestPart(blocksCount, maxItemsPerBlock int) (*part, []string, error) {
var itemsMerged uint64
var ip inmemoryPart
var bsw blockStreamWriter
bsw.InitFromInmemoryPart(&ip, 0)
bsw.InitFromInmemoryPart(&ip)
if err := mergeBlockStreams(&ip.ph, &bsw, bsrs, nil, nil, &itemsMerged); err != nil {
return nil, nil, fmt.Errorf("cannot merge blocks: %s", err)
}

View File

@@ -13,6 +13,7 @@ import (
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/memory"
@@ -102,7 +103,7 @@ type Table struct {
rawItemsBlocks []*inmemoryBlock
rawItemsLock sync.Mutex
rawItemsLastFlushTime time.Time
rawItemsLastFlushTime uint64
snapshotLock sync.RWMutex
@@ -369,7 +370,7 @@ func (tb *Table) AddItems(items [][]byte) error {
if len(tb.rawItemsBlocks) >= 1024 {
blocksToMerge = tb.rawItemsBlocks
tb.rawItemsBlocks = nil
tb.rawItemsLastFlushTime = time.Now()
tb.rawItemsLastFlushTime = fasttime.UnixTimestamp()
}
tb.rawItemsLock.Unlock()
@@ -508,11 +509,15 @@ func (tb *Table) flushRawItems(isFinal bool) {
defer tb.rawItemsPendingFlushesWG.Done()
mustFlush := false
currentTime := time.Now()
currentTime := fasttime.UnixTimestamp()
flushSeconds := int64(rawItemsFlushInterval.Seconds())
if flushSeconds <= 0 {
flushSeconds = 1
}
var blocksToMerge []*inmemoryBlock
tb.rawItemsLock.Lock()
if isFinal || currentTime.Sub(tb.rawItemsLastFlushTime) > rawItemsFlushInterval {
if isFinal || currentTime-tb.rawItemsLastFlushTime > uint64(flushSeconds) {
mustFlush = true
blocksToMerge = tb.rawItemsBlocks
tb.rawItemsBlocks = nil
@@ -619,9 +624,8 @@ func (tb *Table) mergeInmemoryBlocks(blocksToMerge []*inmemoryBlock) *partWrappe
// Prepare blockStreamWriter for destination part.
bsw := getBlockStreamWriter()
compressLevel := 1
mpDst := getInmemoryPart()
bsw.InitFromInmemoryPart(mpDst, compressLevel)
bsw.InitFromInmemoryPart(mpDst)
// Merge parts.
// The merge shouldn't be interrupted by stopCh,
@@ -674,7 +678,7 @@ const (
func (tb *Table) partMerger() error {
sleepTime := minMergeSleepTime
var lastMergeTime time.Time
var lastMergeTime uint64
isFinal := false
t := time.NewTimer(sleepTime)
for {
@@ -682,7 +686,7 @@ func (tb *Table) partMerger() error {
if err == nil {
// Try merging additional parts.
sleepTime = minMergeSleepTime
lastMergeTime = time.Now()
lastMergeTime = fasttime.UnixTimestamp()
isFinal = false
continue
}
@@ -693,10 +697,10 @@ func (tb *Table) partMerger() error {
if err != errNothingToMerge {
return err
}
if time.Since(lastMergeTime) > 30*time.Second {
if fasttime.UnixTimestamp()-lastMergeTime > 30 {
// We have free time for merging into bigger parts.
// This should improve select performance.
lastMergeTime = time.Now()
lastMergeTime = fasttime.UnixTimestamp()
isFinal = true
continue
}
@@ -764,8 +768,10 @@ func (tb *Table) mergeParts(pws []*partWrapper, stopCh <-chan struct{}, isOuterP
}
outItemsCount := uint64(0)
outBlocksCount := uint64(0)
for _, pw := range pws {
outItemsCount += pw.p.ph.itemsCount
outBlocksCount += pw.p.ph.blocksCount
}
nocache := true
if outItemsCount < maxItemsPerCachedPart() {
@@ -778,7 +784,7 @@ func (tb *Table) mergeParts(pws []*partWrapper, stopCh <-chan struct{}, isOuterP
mergeIdx := tb.nextMergeIdx()
tmpPartPath := fmt.Sprintf("%s/tmp/%016X", tb.path, mergeIdx)
bsw := getBlockStreamWriter()
compressLevel := getCompressLevelForPartItems(outItemsCount)
compressLevel := getCompressLevelForPartItems(outItemsCount, outBlocksCount)
if err := bsw.InitFromFilePart(tmpPartPath, nocache, compressLevel); err != nil {
return fmt.Errorf("cannot create destination part %q: %s", tmpPartPath, err)
}
@@ -863,27 +869,43 @@ func (tb *Table) mergeParts(pws []*partWrapper, stopCh <-chan struct{}, isOuterP
d := time.Since(startTime)
if d > 10*time.Second {
logger.Infof("merged %d items in %.3f seconds at %d items/sec to %q; sizeBytes: %d",
outItemsCount, d.Seconds(), int(float64(outItemsCount)/d.Seconds()), dstPartPath, newPSize)
logger.Infof("merged %d items across %d blocks in %.3f seconds at %d items/sec to %q; sizeBytes: %d",
outItemsCount, outBlocksCount, d.Seconds(), int(float64(outItemsCount)/d.Seconds()), dstPartPath, newPSize)
}
return nil
}
func getCompressLevelForPartItems(itemsCount uint64) int {
if itemsCount < 1<<19 {
func getCompressLevelForPartItems(itemsCount, blocksCount uint64) int {
// There is no need in using blocksCount here, since mergeset blocks are usually full.
if itemsCount <= 1<<16 {
// -5 is the minimum supported compression for zstd.
// See https://github.com/facebook/zstd/releases/tag/v1.3.4
return -5
}
if itemsCount <= 1<<17 {
return -4
}
if itemsCount <= 1<<18 {
return -3
}
if itemsCount <= 1<<19 {
return -2
}
if itemsCount <= 1<<20 {
return -1
}
if itemsCount <= 1<<22 {
return 1
}
if itemsCount < 1<<22 {
if itemsCount <= 1<<25 {
return 2
}
if itemsCount < 1<<25 {
if itemsCount <= 1<<28 {
return 3
}
if itemsCount < 1<<28 {
return 4
}
return 5
return 4
}
func (tb *Table) nextMergeIdx() uint64 {
@@ -892,15 +914,15 @@ func (tb *Table) nextMergeIdx() uint64 {
var (
maxOutPartItemsLock sync.Mutex
maxOutPartItemsDeadline time.Time
maxOutPartItemsDeadline uint64
lastMaxOutPartItems uint64
)
func (tb *Table) maxOutPartItems() uint64 {
maxOutPartItemsLock.Lock()
if time.Until(maxOutPartItemsDeadline) < 0 {
if maxOutPartItemsDeadline < fasttime.UnixTimestamp() {
lastMaxOutPartItems = tb.maxOutPartItemsSlow()
maxOutPartItemsDeadline = time.Now().Add(time.Second)
maxOutPartItemsDeadline = fasttime.UnixTimestamp() + 2
}
n := lastMaxOutPartItems
maxOutPartItemsLock.Unlock()

View File

@@ -75,6 +75,8 @@ func (fq *FastQueue) flushInmemoryBlocksToFileLocked() {
fq.pendingInmemoryBytes -= uint64(len(bb.B))
blockBufPool.Put(bb)
}
// Unblock all the potentially blocked readers, so they could proceed with reading file-based queue.
fq.cond.Broadcast()
}
// GetPendingBytes returns the number of pending bytes in the fq.
@@ -120,10 +122,10 @@ func (fq *FastQueue) MustWriteBlock(block []byte) {
bb.B = append(bb.B[:0], block...)
fq.ch <- bb
fq.pendingInmemoryBytes += uint64(len(block))
if len(fq.ch) == 1 {
// Notify potentially blocked reader
fq.cond.Signal()
}
// Notify potentially blocked reader.
// See https://github.com/VictoriaMetrics/VictoriaMetrics/pull/484 for the context.
fq.cond.Signal()
}
// MustReadBlock reads the next block from fq to dst and returns it.
@@ -137,7 +139,7 @@ func (fq *FastQueue) MustReadBlock(dst []byte) ([]byte, bool) {
}
if len(fq.ch) > 0 {
if n := fq.pq.GetPendingBytes(); n > 0 {
logger.Panicf("BUG: the file-based queue must be empty when the inmemory queue is empty; it contains %d pending bytes", n)
logger.Panicf("BUG: the file-based queue must be empty when the inmemory queue is non-empty; it contains %d pending bytes", n)
}
bb := <-fq.ch
fq.pendingInmemoryBytes -= uint64(len(bb.B))

View File

@@ -383,13 +383,13 @@ func (q *Queue) MustWriteBlock(block []byte) {
return
}
}
mustNotifyReader := q.readerOffset == q.writerOffset
if err := q.writeBlockLocked(block); err != nil {
logger.Panicf("FATAL: %s", err)
}
if mustNotifyReader {
q.cond.Signal()
}
// Notify blocked reader if any.
// See https://github.com/VictoriaMetrics/VictoriaMetrics/pull/484 for details.
q.cond.Signal()
}
var blockBufPool bytesutil.ByteBufferPool

View File

@@ -69,9 +69,8 @@ func (ac *Config) NewTLSConfig() *tls.Config {
ClientSessionCache: tls.NewLRUClientSessionCache(0),
}
if ac.TLSCertificate != nil {
tlsCfg.GetClientCertificate = func(*tls.CertificateRequestInfo) (*tls.Certificate, error) {
return ac.TLSCertificate, nil
}
// Do not set tlsCfg.GetClientCertificate, since tlsCfg.Certificates should work OK.
tlsCfg.Certificates = []tls.Certificate{*ac.TLSCertificate}
}
tlsCfg.ServerName = ac.TLSServerName
tlsCfg.InsecureSkipVerify = ac.TLSInsecureSkipVerify

View File

@@ -21,10 +21,15 @@ func MarshalWriteRequest(dst []byte, wr *WriteRequest) []byte {
// ResetWriteRequest resets wr.
func ResetWriteRequest(wr *WriteRequest) {
for i := range wr.Timeseries {
ts := wr.Timeseries[i]
wr.Timeseries = ResetTimeSeries(wr.Timeseries)
}
// ResetTimeSeries clears all the GC references from tss and returns an empty tss ready for further use.
func ResetTimeSeries(tss []TimeSeries) []TimeSeries {
for i := range tss {
ts := tss[i]
ts.Labels = nil
ts.Samples = nil
}
wr.Timeseries = wr.Timeseries[:0]
return tss[:0]
}

View File

@@ -7,7 +7,6 @@ import (
"strings"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/netutil"
"github.com/VictoriaMetrics/fasthttp"
"github.com/VictoriaMetrics/metrics"
)
@@ -49,7 +48,6 @@ func newClient(sw *ScrapeWork) *client {
Addr: host,
Name: "vm_promscrape",
Dial: statDial,
DialDualStack: netutil.TCP6Enabled(),
IsTLS: isTLS,
TLSConfig: tlsCfg,
MaxIdleConnDuration: 2 * sw.ScrapeInterval,

View File

@@ -5,6 +5,7 @@ import (
"fmt"
"io/ioutil"
"net/url"
"os"
"path/filepath"
"strings"
"sync/atomic"
@@ -25,6 +26,9 @@ import (
var (
strictParse = flag.Bool("promscrape.config.strictParse", false, "Whether to allow only supported fields in '-promscrape.config'. "+
"This option may be used for errors detection in '-promscrape.config' file")
dryRun = flag.Bool("promscrape.config.dryRun", false, "Checks -promscrape.config file for errors and unsupported fields and then exits. "+
"Returns non-zero exit code on parsing errors and emits these errors to stderr. "+
"Pass -loggerLevel=ERROR if you don't need to see info messages in the output")
)
// Config represents essential parts from Prometheus config defined at https://prometheus.io/docs/prometheus/latest/configuration/configuration/
@@ -114,6 +118,13 @@ func loadConfig(path string) (cfg *Config, data []byte, err error) {
if err := cfgObj.parse(data, path); err != nil {
return nil, nil, fmt.Errorf("cannot parse Prometheus config from %q: %s", path, err)
}
if *dryRun {
// This is a dirty hack for checking Prometheus config only.
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/362
// and https://github.com/VictoriaMetrics/VictoriaMetrics/issues/508 for details.
logger.Infof("Success: the config at %q has no errors; exitting with 0 status code", path)
os.Exit(0)
}
return &cfgObj, data, nil
}
@@ -139,7 +150,7 @@ func (cfg *Config) parse(data []byte, path string) error {
func unmarshalMaybeStrict(data []byte, dst interface{}) error {
var err error
if *strictParse {
if *strictParse || *dryRun {
err = yaml.UnmarshalStrict(data, dst)
} else {
err = yaml.Unmarshal(data, dst)

View File

@@ -50,7 +50,7 @@ func newAPIConfig(sdc *SDConfig, baseDir string) (*apiConfig, error) {
tlsConfig := promauth.TLSConfig{
CAFile: "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt",
}
acNew, err := promauth.NewConfig("/", nil, "", "/var/run/secrets/kubernetes.io/serviceaccount/token", &tlsConfig)
acNew, err := promauth.NewConfig(".", nil, "", "/var/run/secrets/kubernetes.io/serviceaccount/token", &tlsConfig)
if err != nil {
return nil, fmt.Errorf("cannot initialize service account auth: %s; probably, `kubernetes_sd_config->api_server` is missing in Prometheus configs?", err)
}

View File

@@ -2,17 +2,26 @@ package discoveryutils
import (
"crypto/tls"
"flag"
"fmt"
"net"
"net/http"
"strings"
"sync"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/netutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promauth"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/timerpool"
"github.com/VictoriaMetrics/fasthttp"
)
var (
maxConcurrency = flag.Int("promscrape.discovery.concurrency", 500, "The maximum number of concurrent requests to Prometheus autodiscovery API (Consul, Kubernetes, etc.)")
maxWaitTime = flag.Duration("promscrape.discovery.concurrentWaitTime", time.Minute, "The maximum duration for waiting to perform API requests "+
"if more than -promscrape.discovery.concurrency requests are simultaneously performed")
)
var defaultClient = &http.Client{
Timeout: 30 * time.Second,
}
@@ -56,6 +65,7 @@ func NewClient(apiServer string, ac *promauth.Config) (*Client, error) {
ReadTimeout: time.Minute,
WriteTimeout: 10 * time.Second,
MaxResponseBodySize: 300 * 1024 * 1024,
MaxConns: *maxConcurrency,
}
return &Client{
hc: hc,
@@ -65,8 +75,30 @@ func NewClient(apiServer string, ac *promauth.Config) (*Client, error) {
}, nil
}
var (
concurrencyLimitCh chan struct{}
concurrencyLimitChOnce sync.Once
)
func concurrencyLimitChInit() {
concurrencyLimitCh = make(chan struct{}, *maxConcurrency)
}
// GetAPIResponse returns response for the given absolute path.
func (c *Client) GetAPIResponse(path string) ([]byte, error) {
// Limit the number of concurrent API requests.
concurrencyLimitChOnce.Do(concurrencyLimitChInit)
t := timerpool.Get(*maxWaitTime)
select {
case concurrencyLimitCh <- struct{}{}:
timerpool.Put(t)
case <-t.C:
timerpool.Put(t)
return nil, fmt.Errorf("too many outstanding requests to %q; try increasing -promscrape.discovery.concurrentWaitTime=%s or -promscrape.discovery.concurrency=%d",
c.apiServer, *maxWaitTime, *maxConcurrency)
}
defer func() { <-concurrencyLimitCh }()
requestURL := c.apiServer + path
var u fasthttp.URI
u.Update(requestURL)

View File

@@ -3,6 +3,8 @@ package discoveryutils
import (
"sync"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
)
// ConfigMap is a map for storing discovery api configs.
@@ -37,7 +39,7 @@ func (cm *ConfigMap) Get(key interface{}, newConfig func() (interface{}, error))
e := cm.m[key]
if e != nil {
e.lastAccessTime = time.Now()
e.lastAccessTime = fasttime.UnixTimestamp()
return e.cfg, nil
}
cfg, err := newConfig()
@@ -46,17 +48,18 @@ func (cm *ConfigMap) Get(key interface{}, newConfig func() (interface{}, error))
}
cm.m[key] = &configMapEntry{
cfg: cfg,
lastAccessTime: time.Now(),
lastAccessTime: fasttime.UnixTimestamp(),
}
return cfg, nil
}
func (cm *ConfigMap) cleaner() {
tc := time.NewTicker(15 * time.Minute)
for currentTime := range tc.C {
for range tc.C {
currentTime := fasttime.UnixTimestamp()
cm.mu.Lock()
for k, e := range cm.m {
if currentTime.Sub(e.lastAccessTime) > 10*time.Minute {
if currentTime-e.lastAccessTime > 10*60 {
delete(cm.m, k)
}
}
@@ -66,5 +69,5 @@ func (cm *ConfigMap) cleaner() {
type configMapEntry struct {
cfg interface{}
lastAccessTime time.Time
lastAccessTime uint64
}

View File

@@ -37,6 +37,15 @@ var (
"See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config for details")
)
// CheckConfig checks -promscrape.config for errors and unsupported options.
func CheckConfig() error {
if *promscrapeConfigFile == "" {
return fmt.Errorf("missing -promscrape.config option")
}
_, _, err := loadConfig(*promscrapeConfigFile)
return err
}
// Init initializes Prometheus scraper with config from the `-promscrape.config`.
//
// Scraped data is passed to pushData.

View File

@@ -10,6 +10,7 @@ import (
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/common"
"github.com/VictoriaMetrics/metrics"
)
@@ -69,7 +70,7 @@ func (ctx *streamContext) Read(r io.Reader) bool {
rows := ctx.Rows.Rows
// Fill missing timestamps with the current timestamp rounded to seconds.
currentTimestamp := time.Now().Unix()
currentTimestamp := int64(fasttime.UnixTimestamp())
for i := range rows {
r := &rows[i]
if r.Timestamp == 0 {

View File

@@ -10,6 +10,7 @@ import (
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/common"
"github.com/VictoriaMetrics/metrics"
)
@@ -68,7 +69,7 @@ func (ctx *streamContext) Read(r io.Reader) bool {
rows := ctx.Rows.Rows
// Fill in missing timestamps
currentTimestamp := time.Now().Unix()
currentTimestamp := int64(fasttime.UnixTimestamp())
for i := range rows {
r := &rows[i]
if r.Timestamp == 0 {

View File

@@ -10,6 +10,7 @@ import (
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/common"
"github.com/VictoriaMetrics/metrics"
)
@@ -67,7 +68,7 @@ func ParseStream(req *http.Request, callback func(rows []Row) error) error {
rows := ctx.Rows.Rows
// Fill in missing timestamps
currentTimestamp := time.Now().Unix()
currentTimestamp := int64(fasttime.UnixTimestamp())
for i := range rows {
r := &rows[i]
if r.Timestamp == 0 {

View File

@@ -189,7 +189,7 @@ func unmarshalBlockHeaders(dst []blockHeader, src []byte, blockHeadersCount int)
for len(src) > 0 {
tmp, err := bh.Unmarshal(src)
if err != nil {
return nil, fmt.Errorf("cannot unmarshal block header: %s", err)
return dst, fmt.Errorf("cannot unmarshal block header: %s", err)
}
src = tmp
dst = append(dst, bh)
@@ -199,12 +199,12 @@ func unmarshalBlockHeaders(dst []blockHeader, src []byte, blockHeadersCount int)
// Verify the number of read block headers.
if len(newBHS) != blockHeadersCount {
return nil, fmt.Errorf("invalid number of block headers found: %d; want %d block headers", len(newBHS), blockHeadersCount)
return dst, fmt.Errorf("invalid number of block headers found: %d; want %d block headers", len(newBHS), blockHeadersCount)
}
// Verify that block headers are sorted by tsid.
if !sort.SliceIsSorted(newBHS, func(i, j int) bool { return newBHS[i].TSID.Less(&newBHS[j].TSID) }) {
return nil, fmt.Errorf("block headers must be sorted by tsid; unmarshaled unsorted block headers: %+v", newBHS)
return dst, fmt.Errorf("block headers must be sorted by tsid; unmarshaled unsorted block headers: %+v", newBHS)
}
return dst, nil

View File

@@ -71,7 +71,10 @@ func (bsw *blockStreamWriter) reset() {
// InitFromInmemoryPart initialzes bsw from inmemory part.
func (bsw *blockStreamWriter) InitFromInmemoryPart(mp *inmemoryPart) {
bsw.reset()
bsw.compressLevel = 0
// Use the minimum compression level for in-memory blocks,
// since they are going to be re-compressed during the merge into file-based blocks.
bsw.compressLevel = -5 // See https://github.com/facebook/zstd/releases/tag/v1.3.4
bsw.timestampsWriter = &mp.timestampsData
bsw.valuesWriter = &mp.valuesData

View File

@@ -15,6 +15,7 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/memory"
@@ -633,7 +634,7 @@ func (db *indexDB) generateTSID(dst *TSID, metricName []byte, mn *MetricName) er
if len(mn.Tags) > 1 {
dst.InstanceID = uint32(xxhash.Sum64(mn.Tags[1].Value))
}
dst.MetricID = getUniqueUint64()
dst.MetricID = generateUniqueMetricID()
return nil
}
@@ -1233,7 +1234,7 @@ func (db *indexDB) updateDeletedMetricIDs(metricIDs *uint64set.Set) {
}
func (is *indexSearch) getStartDateForPerDayInvertedIndex() (uint64, error) {
minDate := uint64(timestampFromTime(time.Now())) / msecPerDay
minDate := fasttime.UnixDate()
kb := &is.kb
ts := &is.ts
kb.B = append(kb.B[:0], nsPrefixDateTagToMetricIDs)
@@ -1866,7 +1867,7 @@ func (is *indexSearch) searchMetricIDs(tfss []*TagFilters, tr TimeRange, maxMetr
return nil, err
}
if metricIDs.Len() > maxMetrics {
return nil, fmt.Errorf("the number or unique timeseries exceeds %d; either narrow down the search or increase -search.maxUniqueTimeseries", maxMetrics)
return nil, fmt.Errorf("the number of unique timeseries exceeds %d; either narrow down the search or increase -search.maxUniqueTimeseries", maxMetrics)
}
// Stop the iteration, since we cannot find more metric ids with the remaining tfss.
break
@@ -1875,7 +1876,7 @@ func (is *indexSearch) searchMetricIDs(tfss []*TagFilters, tr TimeRange, maxMetr
return nil, err
}
if metricIDs.Len() > maxMetrics {
return nil, fmt.Errorf("the number or matching unique timeseries exceeds %d; either narrow down the search or increase -search.maxUniqueTimeseries", maxMetrics)
return nil, fmt.Errorf("the number of matching unique timeseries exceeds %d; either narrow down the search or increase -search.maxUniqueTimeseries", maxMetrics)
}
}
if metricIDs.Len() == 0 {
@@ -2529,19 +2530,7 @@ func (is *indexSearch) getMetricIDsForRecentHours(tr TimeRange, maxMetrics int)
return nil, false
}
func (db *indexDB) storeDateMetricID(date, metricID uint64) error {
is := db.getIndexSearch()
ok, err := is.hasDateMetricID(date, metricID)
db.putIndexSearch(is)
if err != nil {
return err
}
if ok {
// Fast path: the (date, metricID) entry already exists in the db.
return nil
}
// Slow path: create (date, metricID) entries.
func (is *indexSearch) storeDateMetricID(date, metricID uint64) error {
items := getIndexItems()
defer putIndexItems(items)
@@ -2555,12 +2544,16 @@ func (db *indexDB) storeDateMetricID(date, metricID uint64) error {
defer kbPool.Put(kb)
mn := GetMetricName()
defer PutMetricName(mn)
kb.B, err = db.searchMetricName(kb.B[:0], metricID)
var err error
// There is no need in searching for metric name in is.db.extDB,
// Since the storeDateMetricID function is called only after the metricID->metricName
// is added into the current is.db.
kb.B, err = is.searchMetricName(kb.B[:0], metricID)
if err != nil {
if err == io.EOF {
logger.Errorf("missing metricName by metricID %d; this could be the case after unclean shutdown; "+
"deleting the metricID, so it could be re-created next time", metricID)
if err := db.deleteMetricIDs([]uint64{metricID}); err != nil {
if err := is.db.deleteMetricIDs([]uint64{metricID}); err != nil {
return fmt.Errorf("cannot delete metricID %d after unclean shutdown: %s", metricID, err)
}
return nil
@@ -2585,8 +2578,7 @@ func (db *indexDB) storeDateMetricID(date, metricID uint64) error {
items.B = encoding.MarshalUint64(items.B, metricID)
items.Next()
}
if err = db.tb.AddItems(items.Items); err != nil {
if err = is.db.tb.AddItems(items.Items); err != nil {
return fmt.Errorf("cannot add per-day entires for metricID %d: %s", metricID, err)
}
return nil
@@ -2785,14 +2777,17 @@ func (is *indexSearch) intersectMetricIDsWithTagFilterNocache(tf *tagFilter, fil
var kbPool bytesutil.ByteBufferPool
// Returns local unique MetricID.
func getUniqueUint64() uint64 {
return atomic.AddUint64(&uniqueUint64, 1)
func generateUniqueMetricID() uint64 {
// It is expected that metricIDs returned from this function must be dense.
// If they will be sparse, then this may hurt metric_ids intersection
// performance with uint64set.Set.
return atomic.AddUint64(&nextUniqueMetricID, 1)
}
// This number mustn't go backwards on restarts, otherwise metricID
// collisions are possible. So don't change time on the server
// between VictoriaMetrics restarts.
var uniqueUint64 = uint64(time.Now().UnixNano())
var nextUniqueMetricID = uint64(time.Now().UnixNano())
func marshalCommonPrefix(dst []byte, nsPrefix byte) []byte {
dst = append(dst, nsPrefix)

View File

@@ -685,7 +685,7 @@ func testIndexDBGetOrCreateTSIDByName(db *indexDB, metricGroups int) ([]MetricNa
date := uint64(timestampFromTime(time.Now())) / msecPerDay
for i := range tsids {
tsid := &tsids[i]
if err := db.storeDateMetricID(date, tsid.MetricID); err != nil {
if err := is.storeDateMetricID(date, tsid.MetricID); err != nil {
return nil, nil, fmt.Errorf("error in storeDateMetricID(%d, %d): %s", date, tsid.MetricID, err)
}
}
@@ -1476,7 +1476,7 @@ func TestSearchTSIDWithTimeRange(t *testing.T) {
date := baseDate - uint64(day*msecPerDay)
for i := range tsids {
tsid := &tsids[i]
if err := db.storeDateMetricID(date, tsid.MetricID); err != nil {
if err := is.storeDateMetricID(date, tsid.MetricID); err != nil {
t.Fatalf("error in storeDateMetricID(%d, %d): %s", date, tsid.MetricID, err)
}
}

View File

@@ -2,9 +2,9 @@ package storage
import (
"sync"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
)
@@ -17,7 +17,7 @@ type inmemoryPart struct {
indexData bytesutil.ByteBuffer
metaindexData bytesutil.ByteBuffer
creationTime time.Time
creationTime uint64
}
// Reset resets mp.
@@ -29,7 +29,7 @@ func (mp *inmemoryPart) Reset() {
mp.indexData.Reset()
mp.metaindexData.Reset()
mp.creationTime = time.Time{}
mp.creationTime = 0
}
// InitFromRows initializes mp from the given rows.
@@ -42,7 +42,7 @@ func (mp *inmemoryPart) InitFromRows(rows []rawRow) {
rrm := getRawRowsMarshaler()
rrm.marshalToInmemoryPart(mp, rows)
putRawRowsMarshaler(rrm)
mp.creationTime = time.Now()
mp.creationTime = fasttime.UnixTimestamp()
}
// NewPart creates new part from mp.

View File

@@ -7,6 +7,7 @@ import (
"sync/atomic"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/filestream"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/memory"
@@ -117,6 +118,7 @@ func newPart(ph *partHeader, path string, size uint64, metaindexReader filestrea
p.indexFile = indexFile
p.metaindex = metaindex
p.ibCache = newIndexBlockCache()
if len(errors) > 0 {
// Return only the first error, since it has no sense in returning all errors.
@@ -125,8 +127,6 @@ func newPart(ph *partHeader, path string, size uint64, metaindexReader filestrea
return nil, err
}
p.ibCache = newIndexBlockCache()
return &p, nil
}
@@ -229,7 +229,7 @@ func (ibc *indexBlockCache) cleaner() {
}
func (ibc *indexBlockCache) cleanByTimeout() {
currentTime := atomic.LoadUint64(&currentTimestamp)
currentTime := fasttime.UnixTimestamp()
ibc.mu.Lock()
for k, ibe := range ibc.m {
// Delete items accessed more than 10 minutes ago.
@@ -248,7 +248,7 @@ func (ibc *indexBlockCache) Get(k uint64) *indexBlock {
ibc.mu.RUnlock()
if ibe != nil {
currentTime := atomic.LoadUint64(&currentTimestamp)
currentTime := fasttime.UnixTimestamp()
if atomic.LoadUint64(&ibe.lastAccessTime) != currentTime {
atomic.StoreUint64(&ibe.lastAccessTime, currentTime)
}
@@ -258,7 +258,7 @@ func (ibc *indexBlockCache) Get(k uint64) *indexBlock {
return nil
}
func (ibc *indexBlockCache) Put(k uint64, ib *indexBlock) bool {
func (ibc *indexBlockCache) Put(k uint64, ib *indexBlock) {
ibc.mu.Lock()
// Clean superflouos cache entries.
@@ -277,12 +277,11 @@ func (ibc *indexBlockCache) Put(k uint64, ib *indexBlock) bool {
// Store frequently requested ib in the cache.
ibe := &indexBlockCacheEntry{
lastAccessTime: atomic.LoadUint64(&currentTimestamp),
lastAccessTime: fasttime.UnixTimestamp(),
ib: ib,
}
ibc.m[k] = ibe
ibc.mu.Unlock()
return true
}
func (ibc *indexBlockCache) Requests() uint64 {
@@ -299,16 +298,3 @@ func (ibc *indexBlockCache) Len() uint64 {
ibc.mu.Unlock()
return n
}
func init() {
go func() {
ticker := time.NewTicker(time.Second)
defer ticker.Stop()
for tm := range ticker.C {
t := uint64(tm.Unix())
atomic.StoreUint64(&currentTimestamp, t)
}
}()
}
var currentTimestamp uint64

View File

@@ -36,9 +36,6 @@ type partSearch struct {
bhs []blockHeader
// Pointer to index block, which may be reused
indexBlockReuse *indexBlock
compressedIndexBuf []byte
indexBuf []byte
@@ -53,10 +50,6 @@ func (ps *partSearch) reset() {
ps.metaindex = nil
ps.ibCache = nil
ps.bhs = nil
if ps.indexBlockReuse != nil {
putIndexBlock(ps.indexBlockReuse)
ps.indexBlockReuse = nil
}
ps.compressedIndexBuf = ps.compressedIndexBuf[:0]
ps.indexBuf = ps.indexBuf[:0]
ps.err = nil
@@ -161,10 +154,6 @@ func (ps *partSearch) nextBHS() bool {
// Found the index block which may contain the required data
// for the ps.BlockRef.bh.TSID and the given timestamp range.
if ps.indexBlockReuse != nil {
putIndexBlock(ps.indexBlockReuse)
ps.indexBlockReuse = nil
}
indexBlockKey := mr.IndexBlockOffset
ib := ps.ibCache.Get(indexBlockKey)
if ib == nil {
@@ -176,9 +165,7 @@ func (ps *partSearch) nextBHS() bool {
&ps.p.ph, mr.IndexBlockOffset, mr.IndexBlockSize, err)
return false
}
if ok := ps.ibCache.Put(indexBlockKey, ib); !ok {
ps.indexBlockReuse = ib
}
ps.ibCache.Put(indexBlockKey, ib)
}
ps.bhs = ib.bhs
return true

View File

@@ -16,6 +16,7 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/memory"
@@ -445,7 +446,7 @@ func (rrs *rawRowsShards) Len() int {
type rawRowsShard struct {
lock sync.Mutex
rows []rawRow
lastFlushTime time.Time
lastFlushTime uint64
}
func (rrs *rawRowsShard) Len() int {
@@ -478,7 +479,7 @@ func (rrs *rawRowsShard) addRows(pt *partition, rows []rawRow) {
rr := getRawRowsMaxSize()
rrs.rows, rr.rows = rr.rows, rrs.rows
rrss = append(rrss, rr)
rrs.lastFlushTime = time.Now()
rrs.lastFlushTime = fasttime.UnixTimestamp()
}
rrs.lock.Unlock()
@@ -722,10 +723,14 @@ func (rrs *rawRowsShards) flush(pt *partition, isFinal bool) {
func (rrs *rawRowsShard) flush(pt *partition, isFinal bool) {
var rr *rawRows
currentTime := time.Now()
currentTime := fasttime.UnixTimestamp()
flushSeconds := int64(rawRowsFlushInterval.Seconds())
if flushSeconds <= 0 {
flushSeconds = 1
}
rrs.lock.Lock()
if isFinal || currentTime.Sub(rrs.lastFlushTime) > rawRowsFlushInterval {
if isFinal || currentTime-rrs.lastFlushTime > uint64(flushSeconds) {
rr = getRawRowsMaxSize()
rrs.rows, rr.rows = rr.rows, rrs.rows
}
@@ -764,7 +769,11 @@ func (pt *partition) inmemoryPartsFlusher() {
}
func (pt *partition) flushInmemoryParts(dstPws []*partWrapper, force bool) ([]*partWrapper, error) {
currentTime := time.Now()
currentTime := fasttime.UnixTimestamp()
flushSeconds := int64(inmemoryPartsFlushInterval.Seconds())
if flushSeconds <= 0 {
flushSeconds = 1
}
// Inmemory parts may present only in small parts.
pt.partsLock.Lock()
@@ -772,7 +781,7 @@ func (pt *partition) flushInmemoryParts(dstPws []*partWrapper, force bool) ([]*p
if pw.mp == nil || pw.isInMerge {
continue
}
if force || currentTime.Sub(pw.mp.creationTime) >= inmemoryPartsFlushInterval {
if force || currentTime-pw.mp.creationTime >= uint64(flushSeconds) {
pw.isInMerge = true
dstPws = append(dstPws, pw)
}
@@ -876,7 +885,7 @@ const (
func (pt *partition) partsMerger(mergerFunc func(isFinal bool) error) error {
sleepTime := minMergeSleepTime
var lastMergeTime time.Time
var lastMergeTime uint64
isFinal := false
t := time.NewTimer(sleepTime)
for {
@@ -884,7 +893,7 @@ func (pt *partition) partsMerger(mergerFunc func(isFinal bool) error) error {
if err == nil {
// Try merging additional parts.
sleepTime = minMergeSleepTime
lastMergeTime = time.Now()
lastMergeTime = fasttime.UnixTimestamp()
isFinal = false
continue
}
@@ -895,10 +904,10 @@ func (pt *partition) partsMerger(mergerFunc func(isFinal bool) error) error {
if err != errNothingToMerge {
return err
}
if time.Since(lastMergeTime) > 30*time.Second {
if fasttime.UnixTimestamp()-lastMergeTime > 30 {
// We have free time for merging into bigger parts.
// This should improve select performance.
lastMergeTime = time.Now()
lastMergeTime = fasttime.UnixTimestamp()
isFinal = true
continue
}
@@ -939,7 +948,7 @@ func mustGetFreeDiskSpace(path string) uint64 {
defer freeSpaceMapLock.Unlock()
e, ok := freeSpaceMap[path]
if ok && time.Since(e.updateTime) < time.Second {
if ok && fasttime.UnixTimestamp()-e.updateTime < 2 {
// Fast path - the entry is fresh.
return e.freeSpace
}
@@ -947,7 +956,7 @@ func mustGetFreeDiskSpace(path string) uint64 {
// Slow path.
// Determine the amount of free space on bigPartsPath.
e.freeSpace = fs.MustGetFreeSpace(path)
e.updateTime = time.Now()
e.updateTime = fasttime.UnixTimestamp()
freeSpaceMap[path] = e
return e.freeSpace
}
@@ -958,7 +967,7 @@ var (
)
type freeSpaceEntry struct {
updateTime time.Time
updateTime uint64
freeSpace uint64
}
@@ -1060,8 +1069,10 @@ func (pt *partition) mergeParts(pws []*partWrapper, stopCh <-chan struct{}) erro
}
outRowsCount := uint64(0)
outBlocksCount := uint64(0)
for _, pw := range pws {
outRowsCount += pw.p.ph.RowsCount
outBlocksCount += pw.p.ph.BlocksCount
}
isBigPart := outRowsCount > maxRowsPerSmallPart()
nocache := isBigPart
@@ -1075,7 +1086,7 @@ func (pt *partition) mergeParts(pws []*partWrapper, stopCh <-chan struct{}) erro
mergeIdx := pt.nextMergeIdx()
tmpPartPath := fmt.Sprintf("%s/tmp/%016X", ptPath, mergeIdx)
bsw := getBlockStreamWriter()
compressLevel := getCompressLevelForRowsCount(outRowsCount)
compressLevel := getCompressLevelForRowsCount(outRowsCount, outBlocksCount)
if err := bsw.InitFromFilePart(tmpPartPath, nocache, compressLevel); err != nil {
return fmt.Errorf("cannot create destination part %q: %s", tmpPartPath, err)
}
@@ -1176,24 +1187,28 @@ func (pt *partition) mergeParts(pws []*partWrapper, stopCh <-chan struct{}) erro
d := time.Since(startTime)
if d > 10*time.Second {
logger.Infof("merged %d rows in %.3f seconds at %d rows/sec to %q; sizeBytes: %d",
outRowsCount, d.Seconds(), int(float64(outRowsCount)/d.Seconds()), dstPartPath, newPSize)
logger.Infof("merged %d rows across %d blocks in %.3f seconds at %d rows/sec to %q; sizeBytes: %d",
outRowsCount, outBlocksCount, d.Seconds(), int(float64(outRowsCount)/d.Seconds()), dstPartPath, newPSize)
}
return nil
}
func getCompressLevelForRowsCount(rowsCount uint64) int {
if rowsCount <= 1<<19 {
func getCompressLevelForRowsCount(rowsCount, blocksCount uint64) int {
avgRowsPerBlock := rowsCount / blocksCount
if avgRowsPerBlock <= 200 {
return -1
}
if avgRowsPerBlock <= 500 {
return 1
}
if rowsCount <= 1<<22 {
if avgRowsPerBlock <= 1000 {
return 2
}
if rowsCount <= 1<<25 {
if avgRowsPerBlock <= 2000 {
return 3
}
if rowsCount <= 1<<28 {
if avgRowsPerBlock <= 4000 {
return 4
}
return 5

View File

@@ -161,16 +161,20 @@ func testSearchGeneric(t *testing.T, forcePerDayInvertedIndex bool) {
ch <- testSearchInternal(st, tr, mrs, accountsCount)
}()
}
var firstError error
for i := 0; i < cap(ch); i++ {
select {
case err := <-ch:
if err != nil {
t.Fatalf("unexpected error: %s", err)
if err != nil && firstError == nil {
firstError = err
}
case <-time.After(10 * time.Second):
t.Fatalf("timeout")
}
}
if firstError != nil {
t.Fatalf("unexpected error: %s", firstError)
}
})
}

View File

@@ -15,8 +15,8 @@ import (
"time"
"unsafe"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/memory"
@@ -39,6 +39,10 @@ type Storage struct {
addRowsConcurrencyLimitTimeout uint64
addRowsConcurrencyDroppedRows uint64
slowRowInserts uint64
slowPerDayIndexInserts uint64
slowMetricNameLoads uint64
path string
cachePath string
retentionMonths int
@@ -68,16 +72,27 @@ type Storage struct {
// Fast cache for MetricID values occurred during the previous hour.
prevHourMetricIDs atomic.Value
// Fast cache for pre-populating per-day inverted index for the next day.
// This is needed in order to remove CPU usage spikes at 00:00 UTC
// due to creation of per-day inverted index for active time series.
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/430 for details.
nextDayMetricIDs atomic.Value
// Pending MetricID values to be added to currHourMetricIDs.
pendingHourEntriesLock sync.Mutex
pendingHourEntries *uint64set.Set
// Pending MetricIDs to be added to nextDayMetricIDs.
pendingNextDayMetricIDsLock sync.Mutex
pendingNextDayMetricIDs *uint64set.Set
// metricIDs for pre-fetched metricNames in the prefetchMetricNames function.
prefetchedMetricIDs atomic.Value
stop chan struct{}
currHourMetricIDsUpdaterWG sync.WaitGroup
nextDayMetricIDsUpdaterWG sync.WaitGroup
retentionWatcherWG sync.WaitGroup
prefetchedMetricIDsCleanerWG sync.WaitGroup
@@ -131,13 +146,18 @@ func OpenStorage(path string, retentionMonths int) (*Storage, error) {
s.metricNameCache = s.mustLoadCache("MetricID->MetricName", "metricID_metricName", mem/8)
s.dateMetricIDCache = newDateMetricIDCache()
hour := uint64(timestampFromTime(time.Now())) / msecPerHour
hour := fasttime.UnixHour()
hmCurr := s.mustLoadHourMetricIDs(hour, "curr_hour_metric_ids")
hmPrev := s.mustLoadHourMetricIDs(hour-1, "prev_hour_metric_ids")
s.currHourMetricIDs.Store(hmCurr)
s.prevHourMetricIDs.Store(hmPrev)
s.pendingHourEntries = &uint64set.Set{}
date := fasttime.UnixDate()
nextDayMetricIDs := s.mustLoadNextDayMetricIDs(date)
s.nextDayMetricIDs.Store(nextDayMetricIDs)
s.pendingNextDayMetricIDs = &uint64set.Set{}
s.prefetchedMetricIDs.Store(&uint64set.Set{})
// Load indexdb
@@ -163,6 +183,7 @@ func OpenStorage(path string, retentionMonths int) (*Storage, error) {
s.tb = tb
s.startCurrHourMetricIDsUpdater()
s.startNextDayMetricIDsUpdater()
s.startRetentionWatcher()
s.startPrefetchedMetricIDsCleaner()
@@ -306,6 +327,10 @@ type Metrics struct {
AddRowsConcurrencyCapacity uint64
AddRowsConcurrencyCurrent uint64
SlowRowInserts uint64
SlowPerDayIndexInserts uint64
SlowMetricNameLoads uint64
TSIDCacheSize uint64
TSIDCacheSizeBytes uint64
TSIDCacheRequests uint64
@@ -332,6 +357,9 @@ type Metrics struct {
HourMetricIDCacheSize uint64
HourMetricIDCacheSizeBytes uint64
NextDayMetricIDCacheSize uint64
NextDayMetricIDCacheSizeBytes uint64
PrefetchedMetricIDsSize uint64
PrefetchedMetricIDsSizeBytes uint64
@@ -357,6 +385,10 @@ func (s *Storage) UpdateMetrics(m *Metrics) {
m.AddRowsConcurrencyCapacity = uint64(cap(addRowsConcurrencyCh))
m.AddRowsConcurrencyCurrent = uint64(len(addRowsConcurrencyCh))
m.SlowRowInserts += atomic.LoadUint64(&s.slowRowInserts)
m.SlowPerDayIndexInserts += atomic.LoadUint64(&s.slowPerDayIndexInserts)
m.SlowMetricNameLoads += atomic.LoadUint64(&s.slowMetricNameLoads)
var cs fastcache.Stats
s.tsidCache.UpdateStats(&cs)
m.TSIDCacheSize += cs.EntriesCount
@@ -396,6 +428,10 @@ func (s *Storage) UpdateMetrics(m *Metrics) {
m.HourMetricIDCacheSizeBytes += hmCurr.m.SizeBytes()
m.HourMetricIDCacheSizeBytes += hmPrev.m.SizeBytes()
nextDayMetricIDs := &s.nextDayMetricIDs.Load().(*byDateMetricIDEntry).v
m.NextDayMetricIDCacheSize += uint64(nextDayMetricIDs.Len())
m.NextDayMetricIDCacheSizeBytes += nextDayMetricIDs.SizeBytes()
prefetchedMetricIDs := s.prefetchedMetricIDs.Load().(*uint64set.Set)
m.PrefetchedMetricIDsSize += uint64(prefetchedMetricIDs.Len())
m.PrefetchedMetricIDsSizeBytes += uint64(prefetchedMetricIDs.SizeBytes())
@@ -453,6 +489,14 @@ func (s *Storage) startCurrHourMetricIDsUpdater() {
}()
}
func (s *Storage) startNextDayMetricIDsUpdater() {
s.nextDayMetricIDsUpdaterWG.Add(1)
go func() {
s.nextDayMetricIDsUpdater()
s.nextDayMetricIDsUpdaterWG.Done()
}()
}
var currHourMetricIDsUpdateInterval = time.Second * 10
func (s *Storage) currHourMetricIDsUpdater() {
@@ -469,6 +513,22 @@ func (s *Storage) currHourMetricIDsUpdater() {
}
}
var nextDayMetricIDsUpdateInterval = time.Second * 11
func (s *Storage) nextDayMetricIDsUpdater() {
ticker := time.NewTicker(nextDayMetricIDsUpdateInterval)
defer ticker.Stop()
for {
select {
case <-s.stop:
s.updateNextDayMetricIDs()
return
case <-ticker.C:
s.updateNextDayMetricIDs()
}
}
}
func (s *Storage) mustRotateIndexDB() {
// Create new indexdb table.
newTableName := nextIndexDBTableName()
@@ -500,6 +560,8 @@ func (s *Storage) mustRotateIndexDB() {
// Do not flush metricIDCache and metricNameCache, since all the metricIDs
// from prev idb remain valid after the rotation.
// There is no need in resetting nextDayMetricIDs, since it should be automatically reset every day.
}
// MustClose closes the storage.
@@ -508,6 +570,7 @@ func (s *Storage) MustClose() {
s.retentionWatcherWG.Wait()
s.currHourMetricIDsUpdaterWG.Wait()
s.nextDayMetricIDsUpdaterWG.Wait()
s.tb.MustClose()
s.idb().MustClose()
@@ -522,21 +585,70 @@ func (s *Storage) MustClose() {
hmPrev := s.prevHourMetricIDs.Load().(*hourMetricIDs)
s.mustSaveHourMetricIDs(hmPrev, "prev_hour_metric_ids")
nextDayMetricIDs := s.nextDayMetricIDs.Load().(*byDateMetricIDEntry)
s.mustSaveNextDayMetricIDs(nextDayMetricIDs)
// Release lock file.
if err := s.flockF.Close(); err != nil {
logger.Panicf("FATAL: cannot close lock file %q: %s", s.flockF.Name(), err)
}
}
func (s *Storage) mustLoadHourMetricIDs(hour uint64, name string) *hourMetricIDs {
func (s *Storage) mustLoadNextDayMetricIDs(date uint64) *byDateMetricIDEntry {
e := &byDateMetricIDEntry{
date: date,
}
name := "next_day_metric_ids"
path := s.cachePath + "/" + name
logger.Infof("loading %s from %q...", name, path)
startTime := time.Now()
if !fs.IsPathExist(path) {
logger.Infof("nothing to load from %q", path)
return &hourMetricIDs{
hour: hour,
}
return e
}
src, err := ioutil.ReadFile(path)
if err != nil {
logger.Panicf("FATAL: cannot read %s: %s", path, err)
}
srcOrigLen := len(src)
if len(src) < 16 {
logger.Errorf("discarding %s, since it has broken header; got %d bytes; want %d bytes", path, len(src), 16)
return e
}
// Unmarshal header
dateLoaded := encoding.UnmarshalUint64(src)
src = src[8:]
if dateLoaded != date {
logger.Infof("discarding %s, since it contains data for stale date; got %d; want %d", path, dateLoaded, date)
return e
}
// Unmarshal uint64set
m, tail, err := unmarshalUint64Set(src)
if err != nil {
logger.Infof("discarding %s because cannot load uint64set: %s", path, err)
return e
}
if len(tail) > 0 {
logger.Infof("discarding %s because non-empty tail left; len(tail)=%d", path, len(tail))
return e
}
e.v = *m
logger.Infof("loaded %s from %q in %.3f seconds; entriesCount: %d; sizeBytes: %d", name, path, time.Since(startTime).Seconds(), m.Len(), srcOrigLen)
return e
}
func (s *Storage) mustLoadHourMetricIDs(hour uint64, name string) *hourMetricIDs {
hm := &hourMetricIDs{
hour: hour,
}
path := s.cachePath + "/" + name
logger.Infof("loading %s from %q...", name, path)
startTime := time.Now()
if !fs.IsPathExist(path) {
logger.Infof("nothing to load from %q", path)
return hm
}
src, err := ioutil.ReadFile(path)
if err != nil {
@@ -545,9 +657,7 @@ func (s *Storage) mustLoadHourMetricIDs(hour uint64, name string) *hourMetricIDs
srcOrigLen := len(src)
if len(src) < 24 {
logger.Errorf("discarding %s, since it has broken header; got %d bytes; want %d bytes", path, len(src), 24)
return &hourMetricIDs{
hour: hour,
}
return hm
}
// Unmarshal header
@@ -556,34 +666,43 @@ func (s *Storage) mustLoadHourMetricIDs(hour uint64, name string) *hourMetricIDs
hourLoaded := encoding.UnmarshalUint64(src)
src = src[8:]
if hourLoaded != hour {
logger.Infof("discarding %s, since it contains outdated hour; got %d; want %d", name, hourLoaded, hour)
return &hourMetricIDs{
hour: hour,
}
logger.Infof("discarding %s, since it contains outdated hour; got %d; want %d", path, hourLoaded, hour)
return hm
}
// Unmarshal hm.m
hmLen := encoding.UnmarshalUint64(src)
src = src[8:]
if uint64(len(src)) < 8*hmLen {
logger.Errorf("discarding %s, since it has broken hm.m data; got %d bytes; want at least %d bytes", path, len(src), 8*hmLen)
return &hourMetricIDs{
hour: hour,
}
// Unmarshal uint64set
m, tail, err := unmarshalUint64Set(src)
if err != nil {
logger.Infof("discarding %s because cannot load uint64set: %s", path, err)
return hm
}
m := &uint64set.Set{}
for i := uint64(0); i < hmLen; i++ {
metricID := encoding.UnmarshalUint64(src)
src = src[8:]
m.Add(metricID)
if len(tail) > 0 {
logger.Infof("discarding %s because non-empty tail left; len(tail)=%d", path, len(tail))
return hm
}
hm.m = m
hm.isFull = isFull != 0
logger.Infof("loaded %s from %q in %.3f seconds; entriesCount: %d; sizeBytes: %d", name, path, time.Since(startTime).Seconds(), m.Len(), srcOrigLen)
return hm
}
logger.Infof("loaded %s from %q in %.3f seconds; entriesCount: %d; sizeBytes: %d", name, path, time.Since(startTime).Seconds(), hmLen, srcOrigLen)
return &hourMetricIDs{
m: m,
hour: hourLoaded,
isFull: isFull != 0,
func (s *Storage) mustSaveNextDayMetricIDs(e *byDateMetricIDEntry) {
name := "next_day_metric_ids"
path := s.cachePath + "/" + name
logger.Infof("saving %s to %q...", name, path)
startTime := time.Now()
dst := make([]byte, 0, e.v.Len()*8+16)
// Marshal header
dst = encoding.MarshalUint64(dst, e.date)
// Marshal e.v
dst = marshalUint64Set(dst, &e.v)
if err := ioutil.WriteFile(path, dst, 0644); err != nil {
logger.Panicf("FATAL: cannot write %d bytes to %q: %s", len(dst), path, err)
}
logger.Infof("saved %s to %q in %.3f seconds; entriesCount: %d; sizeBytes: %d", name, path, time.Since(startTime).Seconds(), e.v.Len(), len(dst))
}
func (s *Storage) mustSaveHourMetricIDs(hm *hourMetricIDs, name string) {
@@ -601,13 +720,7 @@ func (s *Storage) mustSaveHourMetricIDs(hm *hourMetricIDs, name string) {
dst = encoding.MarshalUint64(dst, hm.hour)
// Marshal hm.m
dst = encoding.MarshalUint64(dst, uint64(hm.m.Len()))
hm.m.ForEach(func(part []uint64) bool {
for _, metricID := range part {
dst = encoding.MarshalUint64(dst, metricID)
}
return true
})
dst = marshalUint64Set(dst, hm.m)
if err := ioutil.WriteFile(path, dst, 0644); err != nil {
logger.Panicf("FATAL: cannot write %d bytes to %q: %s", len(dst), path, err)
@@ -615,6 +728,32 @@ func (s *Storage) mustSaveHourMetricIDs(hm *hourMetricIDs, name string) {
logger.Infof("saved %s to %q in %.3f seconds; entriesCount: %d; sizeBytes: %d", name, path, time.Since(startTime).Seconds(), hm.m.Len(), len(dst))
}
func unmarshalUint64Set(src []byte) (*uint64set.Set, []byte, error) {
mLen := encoding.UnmarshalUint64(src)
src = src[8:]
if uint64(len(src)) < 8*mLen {
return nil, nil, fmt.Errorf("cannot unmarshal uint64set; got %d bytes; want at least %d bytes", len(src), 8*mLen)
}
m := &uint64set.Set{}
for i := uint64(0); i < mLen; i++ {
metricID := encoding.UnmarshalUint64(src)
src = src[8:]
m.Add(metricID)
}
return m, src, nil
}
func marshalUint64Set(dst []byte, m *uint64set.Set) []byte {
dst = encoding.MarshalUint64(dst, uint64(m.Len()))
m.ForEach(func(part []uint64) bool {
for _, metricID := range part {
dst = encoding.MarshalUint64(dst, metricID)
}
return true
})
return dst
}
func (s *Storage) mustLoadCache(info, name string, sizeBytes int) *workingsetcache.Cache {
path := s.cachePath + "/" + name
logger.Infof("loading %s cache from %q...", info, path)
@@ -682,6 +821,7 @@ func (s *Storage) prefetchMetricNames(tsids []TSID) error {
// It is cheaper to skip pre-fetching and obtain metricNames inline.
return nil
}
atomic.AddUint64(&s.slowMetricNameLoads, uint64(len(metricIDs)))
// Pre-fetch metricIDs.
sort.Sort(metricIDs)
@@ -891,10 +1031,6 @@ var (
)
func (s *Storage) add(rows []rawRow, mrs []MetricRow, precisionBits uint8) ([]rawRow, error) {
var is *indexSearch
var mn *MetricName
var kb *bytesutil.ByteBuffer
idb := s.idb()
dmis := idb.getDeletedMetricIDs()
rowsLen := len(rows)
@@ -908,9 +1044,10 @@ func (s *Storage) add(rows []rawRow, mrs []MetricRow, precisionBits uint8) ([]ra
prevTSID TSID
prevMetricNameRaw []byte
)
var pmrs *pendingMetricRows
minTimestamp, maxTimestamp := s.tb.getMinMaxTimestamps()
// Return only the last error, since it has no sense in returning all errors.
var lastWarn error
// Return only the first error, since it has no sense in returning all errors.
var firstWarn error
for i := range mrs {
mr := &mrs[i]
if math.IsNaN(mr.Value) {
@@ -920,13 +1057,19 @@ func (s *Storage) add(rows []rawRow, mrs []MetricRow, precisionBits uint8) ([]ra
}
if mr.Timestamp < minTimestamp {
// Skip rows with too small timestamps outside the retention.
lastWarn = fmt.Errorf("cannot insert row with too small timestamp %d outside the retention; minimum allowed timestamp is %d", mr.Timestamp, minTimestamp)
if firstWarn == nil {
firstWarn = fmt.Errorf("cannot insert row with too small timestamp %d outside the retention; minimum allowed timestamp is %d",
mr.Timestamp, minTimestamp)
}
atomic.AddUint64(&s.tooSmallTimestampRows, 1)
continue
}
if mr.Timestamp > maxTimestamp {
// Skip rows with too big timestamps significantly exceeding the current time.
lastWarn = fmt.Errorf("cannot insert row with too big timestamp %d exceeding the current time; maximum allowd timestamp is %d", mr.Timestamp, maxTimestamp)
if firstWarn == nil {
firstWarn = fmt.Errorf("cannot insert row with too big timestamp %d exceeding the current time; maximum allowd timestamp is %d",
mr.Timestamp, maxTimestamp)
}
atomic.AddUint64(&s.tooBigTimestampRows, 1)
continue
}
@@ -941,76 +1084,170 @@ func (s *Storage) add(rows []rawRow, mrs []MetricRow, precisionBits uint8) ([]ra
r.TSID = prevTSID
continue
}
if s.getTSIDFromCache(&r.TSID, mr.MetricNameRaw) {
if !dmis.Has(r.TSID.MetricID) {
if s.getTSIDFromCache(&r.TSID, mr.MetricNameRaw) && !dmis.Has(r.TSID.MetricID) {
// Fast path - the TSID for the given MetricName has been found in cache and isn't deleted.
prevTSID = r.TSID
prevMetricNameRaw = mr.MetricNameRaw
continue
}
// Slow path - the TSID is missing in the cache.
// Postpone its search in the loop below.
j--
if pmrs == nil {
pmrs = getPendingMetricRows()
}
if err := pmrs.addRow(mr); err != nil {
// Do not stop adding rows on error - just skip invalid row.
// This guarantees that invalid rows don't prevent
// from adding valid rows into the storage.
if firstWarn == nil {
firstWarn = err
}
continue
}
}
if pmrs != nil {
atomic.AddUint64(&s.slowRowInserts, uint64(len(pmrs.pmrs)))
// Sort pendingMetricRows by canonical metric name in order to speed up search via `is` in the loop below.
pendingMetricRows := pmrs.pmrs
sort.Slice(pendingMetricRows, func(i, j int) bool {
return string(pendingMetricRows[i].MetricName) < string(pendingMetricRows[j].MetricName)
})
is := idb.getIndexSearch()
prevMetricNameRaw = nil
for i := range pendingMetricRows {
pmr := &pendingMetricRows[i]
mr := &pmr.mr
r := &rows[rowsLen+j]
j++
r.Timestamp = mr.Timestamp
r.Value = mr.Value
r.PrecisionBits = precisionBits
if string(mr.MetricNameRaw) == string(prevMetricNameRaw) {
// Fast path - the current mr contains the same metric name as the previous mr, so it contains the same TSID.
// This path should trigger on bulk imports when many rows contain the same MetricNameRaw.
r.TSID = prevTSID
continue
}
if s.getTSIDFromCache(&r.TSID, mr.MetricNameRaw) && !dmis.Has(r.TSID.MetricID) {
// Fast path - the TSID for the given MetricName has been found in cache and isn't deleted.
prevTSID = r.TSID
prevMetricNameRaw = mr.MetricNameRaw
continue
}
if err := is.GetOrCreateTSIDByName(&r.TSID, pmr.MetricName); err != nil {
// Do not stop adding rows on error - just skip invalid row.
// This guarantees that invalid rows don't prevent
// from adding valid rows into the storage.
if firstWarn == nil {
firstWarn = fmt.Errorf("cannot obtain or create TSID for MetricName %q: %s", pmr.MetricName, err)
}
j--
continue
}
s.putTSIDToCache(&r.TSID, mr.MetricNameRaw)
}
// Slow path - the TSID is missing in the cache. Search for it in the index.
if is == nil {
is = idb.getIndexSearch()
mn = GetMetricName()
kb = kbPool.Get()
}
if err := mn.unmarshalRaw(mr.MetricNameRaw); err != nil {
// Do not stop adding rows on error - just skip invalid row.
// This guarantees that invalid rows don't prevent
// from adding valid rows into the storage.
lastWarn = fmt.Errorf("cannot unmarshal MetricNameRaw %q: %s", mr.MetricNameRaw, err)
j--
continue
}
mn.sortTags()
kb.B = mn.Marshal(kb.B[:0])
if err := is.GetOrCreateTSIDByName(&r.TSID, kb.B); err != nil {
// Do not stop adding rows on error - just skip invalid row.
// This guarantees that invalid rows don't prevent
// from adding valid rows into the storage.
lastWarn = fmt.Errorf("cannot obtain TSID for MetricName %q: %s", kb.B, err)
j--
continue
}
s.putTSIDToCache(&r.TSID, mr.MetricNameRaw)
}
if lastWarn != nil {
logger.Errorf("warn occurred during rows addition: %s", lastWarn)
}
if is != nil {
kbPool.Put(kb)
PutMetricName(mn)
idb.putIndexSearch(is)
putPendingMetricRows(pmrs)
}
if firstWarn != nil {
logger.Errorf("warn occurred during rows addition: %s", firstWarn)
}
rows = rows[:rowsLen+j]
var lastError error
var firstError error
if err := s.tb.AddRows(rows); err != nil {
lastError = fmt.Errorf("cannot add rows to table: %s", err)
firstError = fmt.Errorf("cannot add rows to table: %s", err)
}
if err := s.updatePerDateData(rows, lastError); err != nil && lastError == nil {
lastError = fmt.Errorf("cannot update per-date data: %s", err)
if err := s.updatePerDateData(rows); err != nil && firstError == nil {
firstError = fmt.Errorf("cannot update per-date data: %s", err)
}
if lastError != nil {
return rows, fmt.Errorf("error occurred during rows addition: %s", lastError)
if firstError != nil {
return rows, fmt.Errorf("error occurred during rows addition: %s", firstError)
}
return rows, nil
}
func (s *Storage) updatePerDateData(rows []rawRow, lastError error) error {
type pendingMetricRow struct {
MetricName []byte
mr MetricRow
}
type pendingMetricRows struct {
pmrs []pendingMetricRow
metricNamesBuf []byte
lastMetricNameRaw []byte
lastMetricName []byte
mn MetricName
}
func (pmrs *pendingMetricRows) reset() {
for _, pmr := range pmrs.pmrs {
pmr.MetricName = nil
pmr.mr.MetricNameRaw = nil
}
pmrs.pmrs = pmrs.pmrs[:0]
pmrs.metricNamesBuf = pmrs.metricNamesBuf[:0]
pmrs.lastMetricNameRaw = nil
pmrs.lastMetricName = nil
pmrs.mn.Reset()
}
func (pmrs *pendingMetricRows) addRow(mr *MetricRow) error {
// Do not spend CPU time on re-calculating canonical metricName during bulk import
// of many rows for the same metric.
if string(mr.MetricNameRaw) != string(pmrs.lastMetricNameRaw) {
if err := pmrs.mn.unmarshalRaw(mr.MetricNameRaw); err != nil {
return fmt.Errorf("cannot unmarshal MetricNameRaw %q: %s", mr.MetricNameRaw, err)
}
pmrs.mn.sortTags()
metricNamesBufLen := len(pmrs.metricNamesBuf)
pmrs.metricNamesBuf = pmrs.mn.Marshal(pmrs.metricNamesBuf)
pmrs.lastMetricName = pmrs.metricNamesBuf[metricNamesBufLen:]
pmrs.lastMetricNameRaw = mr.MetricNameRaw
}
pmrs.pmrs = append(pmrs.pmrs, pendingMetricRow{
MetricName: pmrs.lastMetricName,
mr: *mr,
})
return nil
}
func getPendingMetricRows() *pendingMetricRows {
v := pendingMetricRowsPool.Get()
if v == nil {
v = &pendingMetricRows{}
}
return v.(*pendingMetricRows)
}
func putPendingMetricRows(pmrs *pendingMetricRows) {
pmrs.reset()
pendingMetricRowsPool.Put(pmrs)
}
var pendingMetricRowsPool sync.Pool
func (s *Storage) updatePerDateData(rows []rawRow) error {
var date uint64
var hour uint64
var prevTimestamp int64
var (
// These vars are used for speeding up bulk imports when multiple adjancent rows
// contain the same (metricID, date) pairs.
prevMatchedDate uint64
prevMatchedMetricID uint64
prevDate uint64
prevMetricID uint64
)
idb := s.idb()
hm := s.currHourMetricIDs.Load().(*hourMetricIDs)
nextDayMetricIDs := &s.nextDayMetricIDs.Load().(*byDateMetricIDEntry).v
todayShare16bit := uint64((float64(fasttime.UnixTimestamp()%(3600*24)) / (3600 * 24)) * (1 << 16))
type pendingDateMetricID struct {
date uint64
metricID uint64
}
var pendingDateMetricIDs []pendingDateMetricID
for i := range rows {
r := &rows[i]
if r.Timestamp != prevTimestamp {
@@ -1024,6 +1261,21 @@ func (s *Storage) updatePerDateData(rows []rawRow, lastError error) error {
if hm.m.Has(metricID) {
// Fast path: the metricID is in the current hour cache.
// This means the metricID has been already added to per-day inverted index.
// Gradually pre-populate per-day inverted index for the next day
// during the current day.
// This should reduce CPU usage spike and slowdown at the beginning of the next day
// when entries for all the active time series must be added to the index.
// This should address https://github.com/VictoriaMetrics/VictoriaMetrics/issues/430 .
if todayShare16bit > (metricID&(1<<16-1)) && !nextDayMetricIDs.Has(metricID) {
pendingDateMetricIDs = append(pendingDateMetricIDs, pendingDateMetricID{
date: date + 1,
metricID: metricID,
})
s.pendingNextDayMetricIDsLock.Lock()
s.pendingNextDayMetricIDs.Add(metricID)
s.pendingNextDayMetricIDsLock.Unlock()
}
continue
}
s.pendingHourEntriesLock.Lock()
@@ -1032,29 +1284,80 @@ func (s *Storage) updatePerDateData(rows []rawRow, lastError error) error {
}
// Slower path: check global cache for (date, metricID) entry.
if metricID == prevMatchedMetricID && date == prevMatchedDate {
if metricID == prevMetricID && date == prevDate {
// Fast path for bulk import of multiple rows with the same (date, metricID) pairs.
continue
}
prevDate = date
prevMetricID = metricID
if !s.dateMetricIDCache.Has(date, metricID) {
// Slow path: store the (date, metricID) entry in the indexDB.
// It is OK if the (date, metricID) entry is added multiple times to db
// by concurrent goroutines.
pendingDateMetricIDs = append(pendingDateMetricIDs, pendingDateMetricID{
date: date,
metricID: metricID,
})
}
}
if len(pendingDateMetricIDs) == 0 {
// Fast path - there are no new (date, metricID) entires in rows.
return nil
}
// Slow path - add new (date, metricID) entries to indexDB.
atomic.AddUint64(&s.slowPerDayIndexInserts, uint64(len(pendingDateMetricIDs)))
// Sort pendingDateMetricIDs by (date, metricID) in order to speed up `is` search in the loop below.
sort.Slice(pendingDateMetricIDs, func(i, j int) bool {
a := pendingDateMetricIDs[i]
b := pendingDateMetricIDs[j]
if a.date != b.date {
return a.date < b.date
}
return a.metricID < b.metricID
})
idb := s.idb()
is := idb.getIndexSearch()
defer idb.putIndexSearch(is)
var firstError error
prevMetricID = 0
prevDate = 0
for _, dateMetricID := range pendingDateMetricIDs {
date := dateMetricID.date
metricID := dateMetricID.metricID
if metricID == prevMetricID && date == prevDate {
// Fast path for bulk import of multiple rows with the same (date, metricID) pairs.
continue
}
prevDate = date
prevMetricID = metricID
if s.dateMetricIDCache.Has(date, metricID) {
// The metricID has been already added to per-day inverted index.
prevMatchedDate = date
prevMatchedMetricID = metricID
continue
}
// Slow path: store the (date, metricID) entry in the indexDB.
// It is OK if the (date, metricID) entry is added multiple times to db
// by concurrent goroutines.
if err := idb.storeDateMetricID(date, metricID); err != nil {
lastError = err
ok, err := is.hasDateMetricID(date, metricID)
if err != nil {
if firstError == nil {
firstError = fmt.Errorf("error when locating (date=%d, metricID=%d) in database: %s", date, metricID, err)
}
continue
}
if !ok {
// The (date, metricID) entry is missing in the indexDB. Add it there.
if err := is.storeDateMetricID(date, metricID); err != nil {
if firstError == nil {
firstError = fmt.Errorf("error when storing (date=%d, metricID=%d) in database: %s", date, metricID, err)
}
continue
}
}
// The metric must be added to cache only after it has been successfully added to indexDB.
s.dateMetricIDCache.Set(date, metricID)
}
return lastError
return firstError
}
// dateMetricIDCache is fast cache for holding (date, metricID) entries.
@@ -1070,7 +1373,7 @@ type dateMetricIDCache struct {
// Contains mutable map protected by mu
byDateMutable *byDateMetricIDMap
lastSyncTime time.Time
lastSyncTime uint64
mu sync.Mutex
}
@@ -1085,7 +1388,7 @@ func (dmc *dateMetricIDCache) Reset() {
// Do not reset syncsCount and resetsCount
dmc.byDate.Store(newByDateMetricIDMap())
dmc.byDateMutable = newByDateMetricIDMap()
dmc.lastSyncTime = time.Now()
dmc.lastSyncTime = fasttime.UnixTimestamp()
dmc.mu.Unlock()
atomic.AddUint64(&dmc.resetsCount, 1)
@@ -1119,13 +1422,12 @@ func (dmc *dateMetricIDCache) Has(date, metricID uint64) bool {
}
// Slow path. Check mutable map.
currentTime := time.Now()
currentTime := fasttime.UnixTimestamp()
dmc.mu.Lock()
v = dmc.byDateMutable.get(date)
ok := v.Has(metricID)
mustSync := false
if currentTime.Sub(dmc.lastSyncTime) > 10*time.Second {
if currentTime-dmc.lastSyncTime > 10 {
mustSync = true
dmc.lastSyncTime = currentTime
}
@@ -1207,13 +1509,36 @@ type byDateMetricIDEntry struct {
v uint64set.Set
}
func (s *Storage) updateNextDayMetricIDs() {
date := fasttime.UnixDate()
e := s.nextDayMetricIDs.Load().(*byDateMetricIDEntry)
s.pendingNextDayMetricIDsLock.Lock()
pendingMetricIDs := s.pendingNextDayMetricIDs
s.pendingNextDayMetricIDs = &uint64set.Set{}
s.pendingNextDayMetricIDsLock.Unlock()
if pendingMetricIDs.Len() == 0 && e.date == date {
// Fast path: nothing to update.
return
}
// Slow path: union pendingMetricIDs with e.v
if e.date == date {
pendingMetricIDs.Union(&e.v)
}
eNew := &byDateMetricIDEntry{
date: date,
v: *pendingMetricIDs,
}
s.nextDayMetricIDs.Store(eNew)
}
func (s *Storage) updateCurrHourMetricIDs() {
hm := s.currHourMetricIDs.Load().(*hourMetricIDs)
s.pendingHourEntriesLock.Lock()
newMetricIDs := s.pendingHourEntries
s.pendingHourEntries = &uint64set.Set{}
s.pendingHourEntriesLock.Unlock()
hour := uint64(timestampFromTime(time.Now())) / msecPerHour
hour := fasttime.UnixHour()
if newMetricIDs.Len() == 0 && hm.hour == hour {
// Fast path: nothing to update.
return

Some files were not shown because too many files have changed in this diff Show More