Compare commits

...

779 Commits

Author SHA1 Message Date
Aliaksandr Valialkin
52fe4e68fb deployment/docker: update Go builder from Go1.13.8 to Go1.14.0 2020-02-26 22:14:43 +02:00
Aliaksandr Valialkin
1286cead75 app/vminsert: properly initialize InsertCtx
This should prevent the panic described at https://github.com/VictoriaMetrics/VictoriaMetrics/issues/339
2020-02-26 21:21:02 +02:00
Aliaksandr Valialkin
0597f1e39a app/vmagent: allow setting -httpListenAddr to empty string in order to disable listening for http requests
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/340
2020-02-26 20:58:26 +02:00
Aliaksandr Valialkin
8c2d396e8a make vendor-update 2020-02-26 20:46:24 +02:00
Aliaksandr Valialkin
a6c0d490a3 vendor: update github.com/VictoriaMetrics/metrics from v1.10.1 to v1.11.0 2020-02-26 20:40:34 +02:00
Aliaksandr Valialkin
266101feb4 app/vmagent/README.md: list service discovery mechanisms, which will be added soon
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/334
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/330
2020-02-26 19:28:19 +02:00
Aliaksandr Valialkin
e6a481ab11 lib/promscrape: properly reload new configs on SIGHUP 2020-02-26 13:54:24 +02:00
Aliaksandr Valialkin
fa6815712f lib/promscrape: go fmt 2020-02-26 13:24:40 +02:00
Edouard Hur
fed37ecfcb add envvars details (#337) 2020-02-26 13:23:06 +02:00
Aliaksandr Valialkin
f2a6948a14 lib/promscrape: do not add missing port to __address__ label in order to be consistent with Prometheus behavior
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/331
2020-02-25 20:50:18 +02:00
Aliaksandr Valialkin
c6c7843e93 app/vmagent: add -remoteWrite.maxBlockSize command-line flag for limiting the maximum size of unpacked block to send to remote storage 2020-02-25 19:58:11 +02:00
Aliaksandr Valialkin
c4194020ef app/vmagent: do not allow sending unpacked requests with sizes exceeding -maxInsertRequestSize 2020-02-25 19:35:43 +02:00
Aliaksandr Valialkin
2471340e0d app/vmagent: add ability to accept Influx line protocol data via TCP and UDP
Just set the `-influxListenAddr` command-line flag

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/333
2020-02-25 19:18:01 +02:00
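A minimal usage sketch for the commit above (the port, metric and remote write URL are illustrative, not taken from the commit): start vmagent with the new flag and push a single Influx line over TCP:

    ./vmagent-prod -influxListenAddr=:8189 -remoteWrite.url=http://victoria-metrics:8428/api/v1/write
    echo 'cpu,host=host1 usage_idle=97.5' | nc localhost 8189

The same line can be sent over UDP with `nc -u localhost 8189`.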
Aliaksandr Valialkin
f96fb93ca5 app/vmagent: add a counter for /targets handler calls 2020-02-25 18:17:25 +02:00
Aliaksandr Valialkin
25c570dae7 app/vmagent/README.md: mention that vmagent exposes target statuses at /targets page 2020-02-25 18:16:08 +02:00
Aliaksandr Valialkin
7a045125cc lib/fs: typo fix: read blocks bigger than 8KB via pread() call instead of using mmap 2020-02-25 18:04:06 +02:00
Aliaksandr Valialkin
ca28a3e805 app/vmagent: logo fix 2020-02-25 00:09:55 +02:00
Aliaksandr Valialkin
777a39f7a1 app/vmagent: update docs 2020-02-25 00:09:53 +02:00
Aliaksandr Valialkin
61e67b8922 app/vmagent/README.md: small fixes 2020-02-24 21:26:12 +02:00
Aliaksandr Valialkin
13ee8271d0 lib/envflag: substitute dots with underscores in env var names if -envflag.enable is set
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/311
2020-02-24 21:15:11 +02:00
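A hypothetical illustration of the renaming described above: with `-envflag.enable` set, a dotted flag such as `-search.latencyOffset` would be read from an environment variable whose name has the dots replaced with underscores:

    search_latencyOffset=10s ./victoria-metrics-prod -envflag.enable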
Aliaksandr Valialkin
6ca1e58d98 app/vmselect/promql: properly take into account the first datapoint when calculating rollup_candlestick 2020-02-24 13:25:07 +02:00
Aliaksandr Valialkin
b58e3fc8a9 app/vmselect/promql: do not take into account values outside the current window in rollup_candlestick
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/309
2020-02-23 18:06:26 +02:00
Yaroslav
c69d4b01f0 fix rollupOpen(), rollupHigh(), rollupLow() functions (#328) 2020-02-23 18:06:25 +02:00
Aliaksandr Valialkin
7ee7614e90 app/vmagent: initial implementation for vmagent 2020-02-23 17:31:54 +02:00
Aliaksandr Valialkin
ab1e66d31f vendor: update github.com/valyala/fastjson from v1.4.5 to v1.5.0 2020-02-23 10:06:48 +02:00
Aliaksandr Valialkin
f22aefdb16 app/vmselect/promql: log when rollupResult cache is cleared 2020-02-21 20:06:53 +02:00
Aliaksandr Valialkin
110cce24d9 lib/storage: add vm_ prefix to deduplicated_samples_total metric 2020-02-21 19:33:36 +02:00
Aliaksandr Valialkin
d5c2a0ce64 app/vmselect: add -search.cacheTimestampOffset command-line flag
This flag can be used for removing gaps on graphs if the difference between the current time
and the timestamps from the ingested data exceeds 5 minutes.

This is the case when the clocks on data sources and on VictoriaMetrics are improperly synchronized.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/312
2020-02-21 14:02:15 +02:00
Aliaksandr Valialkin
c70822db50 app/vmselect: add /internal/resetRollupResultCache handler for resetting response cache
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/312
2020-02-21 14:02:12 +02:00
Aliaksandr Valialkin
51abc84932 deployment/docker: update Go builder from v1.13.7 to v1.13.8 2020-02-20 19:56:36 +02:00
Aliaksandr Valialkin
9d279e26a7 lib/protoparser/prometheus: skip leading whitespace from tag names 2020-02-16 19:06:23 +02:00
Aliaksandr Valialkin
fb5848f536 Makefile: add missing vmbackup and vmrestore to all and all-pure targets 2020-02-16 16:55:34 +02:00
Aliaksandr Valialkin
d687e5518d vendor: make vendor-update 2019-12-16 16:12:28 +02:00
Aliaksandr Valialkin
a2b81b71b9 lib/storage: typo fix 2020-02-16 15:53:48 +02:00
Aliaksandr Valialkin
ad4cb9f3ca lib/storage: prevent clobbering non-nil lastError in Storage.add 2020-02-16 15:51:35 +02:00
Aliaksandr Valialkin
afecb34491 app/vmstorage: limit the maximum error message size before sending it to client
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/315
2020-02-13 17:33:12 +02:00
Aliaksandr Valialkin
846d7fa7e9 app/vmselect: add sort_by_label(q, label) and sort_by_label_desc(q, label) functions
This is implementation of https://github.com/prometheus/prometheus/pull/1533 for VictoriaMetrics.
2020-02-13 17:01:50 +02:00
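Hedged MetricsQL examples of the new functions (the metric and label names are illustrative):

    sort_by_label(node_filesystem_avail_bytes, "mountpoint")
    sort_by_label_desc(node_filesystem_avail_bytes, "mountpoint")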
Aliaksandr Valialkin
e3b18ca1ab lib/mergeset: skip creating temporary part objects when merging source inmemory parts
This should reduce CPU usage when adding new entries to inverted index.
This should also prevent creating stalled cleaner goroutines for the created temporary parts,
since they were never closed.

This should fix the following issue: https://github.com/VictoriaMetrics/VictoriaMetrics/issues/316 .
2020-02-13 14:09:13 +02:00
Aliaksandr Valialkin
347aaba79d lib/{storage,mergeset}: use time.Ticker instead of time.Timer where appropriate
It turned out that time.Timer was used in places where time.Ticker must be used instead.
This could result in blocked goroutines as described in https://github.com/VictoriaMetrics/VictoriaMetrics/issues/316 .
2020-02-13 13:21:48 +02:00
Aliaksandr Valialkin
6e0013ca39 app/vmselect/prometheus: typo fix in -latencyOffset description 2020-02-12 14:00:38 +02:00
Aliaksandr Valialkin
22ede83146 make vendor-update 2020-02-10 23:35:14 +02:00
Aliaksandr Valialkin
ebf7785d79 vendor: update github.com/VictoriaMetrics/metrics from v1.9.3 to v1.10.1 2020-02-10 23:07:29 +02:00
Aliaksandr Valialkin
e7d1037210 docs: migrate ExtendedPromQL->MetricsQL in order to be more consistent 2020-02-10 23:03:31 +02:00
Edouard Hur
e8f92a4ee8 Cluster - prometheus metrics fix (#314)
* add missing '/{}' in prom query range requests

* fix missing leading '/' on prom lavelValuesErrors path
2020-02-10 22:15:21 +02:00
Aliaksandr Valialkin
fcdd95a6ef lib/envflag: check for incorrect flag values read from environment vars 2020-02-10 16:09:03 +02:00
Aliaksandr Valialkin
9c5db9400c lib/envflag: add -envflag.enable command-line flag for enabling reading flags from environment vars
By default flags are read only from command line. They can be read from environment vars if `-envflag.enable` is set.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/311
2020-02-10 16:09:01 +02:00
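A minimal sketch of the behavior described above (the flag and value are illustrative): flags keep being read from the command line only, unless `-envflag.enable` is passed, in which case they may additionally come from environment variables:

    loggerFormat=json ./victoria-metrics-prod -envflag.enable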
Aliaksandr Valialkin
1010a57882 all: allow setting flags via environment vars
Now flags can be set via environment vars with the same names as flags.
Command-line flags override flags set via env vars.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/311
2020-02-10 13:31:21 +02:00
Aliaksandr Valialkin
ea66212c93 lib/storage: move -dedup.minScrapeInterval flag outside lib/storage, so it doesn't show up in vminsert in the cluster version 2020-02-10 13:07:25 +02:00
Aliaksandr Valialkin
07c067697e docs/Single-server-VictoriaMetrics.md: sync with README.md 2020-02-07 00:03:05 +02:00
Aliaksandr Valialkin
e6d9ea3094 app/vmselect/promql: do not add step to range end, since this hack became obsolete since commit 9e1119dab8 2020-02-05 21:23:44 +02:00
Aliaksandr Valialkin
4a1de7fee9 app/vmselect/promql: properly adjust time range for data to select
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/309
2020-02-05 21:23:43 +02:00
Aliaksandr Valialkin
8e77b54846 app/vmselect: unconditionally offset -step to rollup_candlestick. This makes results more consistent 2020-02-04 23:31:47 +02:00
Aliaksandr Valialkin
ce38b176bc app/vmselect/promql: automatically apply offset -step to rollup_candlestick function in order to obtain the expected OHLC results
See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/309
2020-02-04 23:24:04 +02:00
Aliaksandr Valialkin
4f7116d1ee app/vmselect/promql: adjust rollup_candlestick calculations to the expected results
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/309
2020-02-04 22:43:28 +02:00
Aliaksandr Valialkin
8b360a25e9 lib/logger: initialize output to os.Stderr by default 2020-02-04 22:43:26 +02:00
Aliaksandr Valialkin
c931a540f4 Do not require checking for errors returned from fmt.Fprint
This fixes `make errcheck` error found in lib/logger
2020-02-04 22:03:52 +02:00
Aliaksandr Valialkin
1f271a9815 lib/logger: add -loggerOutput command-line flag
This flag allows changing log output from `stderr` to `stdout` if `-loggerOutput=stdout` is set.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/306
2020-02-04 21:48:24 +02:00
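For reference, a hedged example of the new flag (the binary name is illustrative):

    ./victoria-metrics-prod -loggerOutput=stdout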
Aliaksandr Valialkin
49ab3fa076 lib/logger: do not clutter -loggerFormat=json output with stack trace
This should improve json parsing
2020-02-04 21:40:20 +02:00
Aliaksandr Valialkin
56d6b8ed0a lib/storage: do not deduplicate blocks with less than 32 samples during merge
This should improve deduplication accuracy for blocks with higher number of samples.
2020-02-04 18:41:37 +02:00
Aliaksandr Valialkin
ccd3aa4f15 app/vmselect: take into account the time the requests wait in the queue if -search.maxConcurrentRequests is exceeded
This will prevent excess CPU usage for timed-out queries.
2020-02-04 16:20:48 +02:00
Aliaksandr Valialkin
e6bf88a4d4 app/vmselect: add a placeholder for /api/v1/metadata, which could be requested by Grafana
See https://prometheus.io/docs/prometheus/latest/querying/api/#querying-metric-metadata

VictoriaMetrics doesn't collect any metadata for metrics, so just return empty response.
2020-02-04 15:56:01 +02:00
Aliaksandr Valialkin
7cde594696 all: do not clash flag description with back-quoted flag types
See https://golang.org/pkg/flag/#PrintDefaults for more details.
2020-02-04 15:56:01 +02:00
Edouard Hur
2ec248453b do not fill max lines (#307) 2020-02-03 21:21:04 +00:00
Roman Khavronenko
ce8eb8a207 improve description for Pending datapoints panel; (#301)
Use bits/s for network usage panels;
2020-02-03 02:07:07 +02:00
Aliaksandr Valialkin
45bc6c62f2 app/vmselect/promql: adjust and and unless binary operator handling to be consistent with Prometheus 2020-01-31 18:52:51 +02:00
Aliaksandr Valialkin
36ea1b503b deployment/docker: update Go builder from v1.13.6 to v1.13.7 2020-01-31 18:06:10 +02:00
Aliaksandr Valialkin
9b25a2fb67 lib/fs: remove unused readerAt interface 2020-01-31 15:13:00 +02:00
Aliaksandr Valialkin
e3adc095bd all: add -dedup.minScrapeInterval command-line flag for data de-duplication
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/86
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/278
2020-01-31 01:18:54 +02:00
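A hedged usage sketch for the de-duplication flag above (the interval value is illustrative; it should normally match the scrape interval of the ingested data):

    ./victoria-metrics-prod -dedup.minScrapeInterval=30s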
Aliaksandr Valialkin
a45f25699c lib/storage: re-use indexSearch inside Storage.prefetchMetricNames 2020-01-31 01:18:53 +02:00
Aliaksandr Valialkin
cb5c39ee70 lib/fs: optimize small reads for ReaderAt.MustReadAt by reading from memory-mapped space instead of reading from file descriptor
This should improve performance when reading many small blocks.
2020-01-30 15:16:16 +02:00
Aliaksandr Valialkin
da19fffa08 all: rename ReadAt* to MustReadAt* in order to avoid clashing with io.ReaderAt 2020-01-30 15:16:16 +02:00
Aliaksandr Valialkin
1332ddc15e lib/storage: pass missing AccountID and ProjectID to searchMetricName 2020-01-30 15:16:16 +02:00
Aliaksandr Valialkin
4ed5e9a7ce lib/storage: pre-fetch metricNames for the found metricIDs in Search.Init
This should speed up Search.NextMetricBlock loop for big number of found time series.
2020-01-30 15:16:16 +02:00
Alexander Danilov
ced989c966 Fix current/max graphs (#298) 2020-01-29 23:48:36 +00:00
Aliaksandr Valialkin
cb2a2f281f lib/mergeset: properly update lastAccessTime in indexBlockCache entries
This is a follow-up for 6665f10e7b
2020-01-29 21:21:01 +02:00
Aliaksandr Valialkin
170c1c3a4e app/vmselect/promql: add keep_next_value(q) for filling gaps with the next non-empty value 2020-01-29 00:48:14 +02:00
Aliaksandr Valialkin
b3bd64fdb2 docs/Single-server-VictoriaMetrics.md: fix heading size for Third-party contributions section 2020-01-28 23:14:06 +02:00
Aliaksandr Valialkin
a9c1d5b351 app/vminsert: moved -maxInsertRequestSize command-line flag out of lib/prompb in order to prevent its inclusion in vmselect and vmstorage apps 2020-01-28 22:53:50 +02:00
Aliaksandr Valialkin
b28c9a3944 app/vmselect/promql: return expected results from increase() over the beginning of time series, which start from big value
Examples for such counters: OS-level counters for network or cpu stats.
2020-01-28 16:31:05 +02:00
Aliaksandr Valialkin
11c03328ae app/victoria-metrics: remove integration build tag from tests
This simplifies testing with `go test ./app/victoria-metrics` without
the need to remember to pass `-tags=integration` to Go commands.
2020-01-27 20:27:39 +02:00
Aliaksandr Valialkin
9a02ca67e9 docs/Single-server-VictoriaMetrics.md: sync with master 2020-01-27 18:47:46 +02:00
Aliaksandr Valialkin
dab9a63485 docs/Single-server-VictoriaMetrics.md: update Retention section 2020-01-27 18:46:32 +02:00
Aliaksandr Valialkin
2bb9b089d5 README.md: mention https://github.com/AnchorFree/tsdb-remote-write 2020-01-27 18:36:22 +02:00
Aliaksandr Valialkin
3e304890a6 app/vmselect/promql: fix panic on a single zero vmrange bucket in prometheus_buckets() function
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/296
2020-01-27 18:05:12 +02:00
Aliaksandr Valialkin
81ba371eaf lib/logger: fix improperly set skipframes for all the logging functions 2020-01-26 18:34:58 +02:00
Aliaksandr Valialkin
9f595cb2b1 lib/httpserver: log the caller of httpserver.Errorf
Previously the log message contained `httpserver.Errorf`; now it contains the caller of `httpserver.Errorf`, which is more useful.
2020-01-25 20:18:06 +02:00
Aliaksandr Valialkin
4d70a81e18 app/vminsert: do not drop pending rows if all the vmstorage backends are unavailable
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/294
2020-01-24 22:10:10 +02:00
Aliaksandr Valialkin
36a1a21d6e lib/protoparser: add parser for Prometheus exposition text format
This parser will be used by vmagent
2020-01-24 20:11:19 +02:00
Aliaksandr Valialkin
0cda6afa8e app/vminsert: move ingestion protocol parsers to lib/protoparser, so they could be re-used in the upcoming vmagent 2020-01-24 16:55:18 +02:00
Aliaksandr Valialkin
a9802fcb72 docs/Articles.md: add a link to https://medium.com/@valyala/billy-how-victoriametrics-deals-with-more-than-500-billion-rows-e82ff8f725da 2020-01-22 19:08:45 +02:00
Aliaksandr Valialkin
ea53a21b02 all: consistently log durations in seconds with millisecond precision
This should improve logs readability
2020-01-22 18:35:24 +02:00
Aliaksandr Valialkin
6eddce1d15 vendor: make vendor-update 2020-01-22 18:09:24 +02:00
Aliaksandr Valialkin
e1a264173a app/vmselect: mention the original query and time range in error messages
This should simplify debugging invalid or heavy queries.
2020-01-22 17:34:35 +02:00
Aliaksandr Valialkin
18a4503261 vendor: update github.com/klauspost/compress from v1.9.7 to v1.9.8
New version should have better gzip compression. See https://github.com/klauspost/compress#changelog
2020-01-22 16:51:17 +02:00
Aliaksandr Valialkin
3c6ae8c947 docs: Mention Slack and Telegram channels for user questions 2020-01-22 16:51:16 +02:00
Aliaksandr Valialkin
e127173984 app/vmselect: mention command-line flag, which could be used for adjusting query timeouts, in timeout errors 2020-01-22 15:53:42 +02:00
Aliaksandr Valialkin
f3b9f8b823 app/vmselect/prometheus: increase the default value for -maxExportDuration to 30 days, since 10 minutes was hurting users exporting big amounts of data 2020-01-22 15:53:41 +02:00
Aliaksandr Valialkin
be5adbfda4 vendor: update github.com/VictoriaMetrics/fastcache from v1.5.5 to v1.5.7 2020-01-22 12:30:55 +02:00
Aliaksandr Valialkin
40e564eb9c app/vmselect/promql: add range_over_time(m[d]) function for calculating value range for m over d 2020-01-21 19:05:29 +02:00
Aliaksandr Valialkin
ecddba30fe app/vminsert/netstorage: increase timeout for pushing data from vminsert to vmstorage by 3x
Our clients report that the previous timeout could lead to frequent errors when
vmstorage starts background merge for big parts on slow HDD.
2020-01-21 18:21:49 +02:00
Aliaksandr Valialkin
9eaa2ab871 app/vmselect/promql: add label_match(q, label, regexp) and label_mismatch(q, label, regexp) functions for filtering out time series with labels matching the given regexp 2020-01-21 15:00:35 +02:00
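Hedged MetricsQL examples of the new filtering functions (the metric, label and regexps are illustrative):

    label_match(node_cpu_seconds_total, "mode", "user|system")
    label_mismatch(node_cpu_seconds_total, "mode", "idle")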
Aliaksandr Valialkin
62b041e90a lib/{mergeset,storage}: properly update lastAccessTime in index and data block cache entries 2020-01-20 15:00:10 +02:00
Aliaksandr Valialkin
b297fec515 README.md: mention that delete API shouldn't be used on a regular basis due to non-zero overhead 2020-01-20 13:28:58 +02:00
Aliaksandr Valialkin
d3b4b0f492 docs/FAQ.md: typo fix according to comment from https://www.reddit.com/message/messages/lezkmo 2020-01-18 18:05:22 +02:00
Aliaksandr Valialkin
179c7db4c9 docs/CaseStudies.md: add links to COLOPL talk about VictoriaMetrics 2020-01-18 17:24:00 +02:00
Aliaksandr Valialkin
cbd0452317 app/vminsert: increase default value for -insert.maxQueueDuration from 30s to 60s
This should help catching up with high ingestion rate after VictoriaMetrics restart.
2020-01-18 14:39:30 +02:00
Aliaksandr Valialkin
607d4418b8 lib/uint64set: add missing bucket32.b16his values 2020-01-18 14:26:23 +02:00
Aliaksandr Valialkin
e3379537cd lib/uint64set: optimize Set.Union
This should improve performance for queries over big number of time series
2020-01-18 13:47:34 +02:00
Aliaksandr Valialkin
5077efd3f7 lib/uint64set: add benchmarks for Set.Union 2020-01-18 13:47:33 +02:00
Aliaksandr Valialkin
a851c75703 lib/storage: skip recovering timestamps order for lossless compression (PrecisionBits=64) 2020-01-17 23:59:19 +02:00
Aliaksandr Valialkin
2084921e64 all: use github.com/klauspost/compress/gzip instead of compress/gzip
`github.com/klauspost/compress/gzip` is more optimized than `compress/gzip`.
This gives better gzip compression and decompression speeds.
2020-01-17 23:59:17 +02:00
Aliaksandr Valialkin
ab4d5d72eb lib/uint64set: reduce memory allocations in Set.AppendTo 2020-01-17 22:33:00 +02:00
Aliaksandr Valialkin
476c7fb109 lib/storage: reduce memory allocations when merging metricID sets 2020-01-17 22:10:56 +02:00
Aliaksandr Valialkin
29d21259f0 lib/uint64set: typo fix in Set.Intersect 2020-01-17 18:11:46 +02:00
Aliaksandr Valialkin
54db08a60f app/vmselect/netstorage: make fmt 2020-01-17 17:46:20 +02:00
Aliaksandr Valialkin
d21cc2d16a app/vmselect/netstorage: limit the maximum size for in-memory buffer for temporary blocks file
This should reduce memory usage on systems with more than 8GB RAM.
2020-01-17 16:28:28 +02:00
Aliaksandr Valialkin
ed1d259b10 lib/uint64set: optimize Intersect, Subtract and Union functions
This should improve performance for queries over big number of time series.
2020-01-17 16:16:43 +02:00
Aliaksandr Valialkin
68d35357b1 lib/uint64set: improve benchmark for Set.Intersect 2020-01-17 16:16:43 +02:00
Aliaksandr Valialkin
b05f6cf11c app/vmselect: limit the default value for -search.maxConcurrentRequests, so it plays well on systems with more than 16 vCPUs
A single heavy request can saturate all the available CPUs, so let's limit the number of concurrent requests to a lower value.
This will give more chances for executing the insert path.
2020-01-17 16:16:43 +02:00
Aliaksandr Valialkin
a9f683423c app/{vminsert,vmselect}: improve error messages when VictoriaMetrics cannot handle too high number of concurrent inserts / selects 2020-01-17 16:16:43 +02:00
Aliaksandr Valialkin
ffe352ad31 lib/uint64set: add benchmark for Set.Intersect 2020-01-17 16:16:43 +02:00
Aliaksandr Valialkin
bdfb219992 make vendor-update 2020-01-16 14:17:48 +02:00
Aliaksandr Valialkin
4b16b7fd11 all: mention command-line flags used for limiting the incoming request size in error messages
This should improve error logs usability.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/287
2020-01-16 13:06:43 +02:00
Aliaksandr Valialkin
ce0b602405 app/vmselect/promql: fix panic on sum(aggr_over_time(...)) with incorrect number of args 2020-01-15 16:26:16 +02:00
Aliaksandr Valialkin
7d429e2806 lib/uint64set: reduce memory usage in Union, Intersect and Subtract methods
Iterate items with newly added Set.ForEach method instead of allocating `[]uint64`
slice for all the items before the iteration.
2020-01-15 12:15:48 +02:00
Aliaksandr Valialkin
4ecb7f15b6 docs/FAQ.md: add bullet comparison with Cortex and Thanos 2020-01-15 10:47:46 +02:00
Aliaksandr Valialkin
caffb0cd01 lib/{mergeset,storage}: fix uint64 counters alignment for 32-bit architectures (GOARCH=386, GOARCH=arm) 2020-01-14 22:47:42 +02:00
Aliaksandr Valialkin
b03ccbf6f7 lib/{storage,mergeset}: gradually remove stale entries from block cache and index caches
This should reduce memory usage in the long run when old blocks and indexes
aren't accessed anymore.
2020-01-14 21:38:29 +02:00
Aliaksandr Valialkin
8a4d4978a3 deployment/docker: update Prometheus from v2.14.0 to v2.15.2 and Grafana from v6.5.0 to v6.5.2 2020-01-12 23:14:56 +02:00
Aliaksandr Valialkin
cbafb7ae59 docs: add a link to VictoriaMetrics subreddit - https://www.reddit.com/r/VictoriaMetrics/ 2020-01-12 00:07:06 +02:00
Aliaksandr Valialkin
bcd3f0c5bd app/vmselect/promql: add hoeffding_bound_upper(phi, m[d]) and hoeffding_bound_lower(phi, m[d]) functions
These functions can be used for calculating Hoeffding bounds
for `m` over `d` time range and for the given `phi` in the range `[0..1]`.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/283
2020-01-11 14:47:13 +02:00
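A hedged MetricsQL example for the functions above (the metric, window and phi value are illustrative):

    hoeffding_bound_upper(0.9, process_resident_memory_bytes[1h])
    hoeffding_bound_lower(0.9, process_resident_memory_bytes[1h])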
Aliaksandr Valialkin
fc01b11ddc app/vmselect/promql: return continuous values for min_over_time and max_over_time when step is smaller than scrape_interval 2020-01-11 12:47:57 +02:00
Aliaksandr Valialkin
92e00779fa deployment/docker: switch Go builder from v1.13.5 to v1.13.6 2020-01-11 11:06:12 +02:00
Aliaksandr Valialkin
2cacea8c64 README.md: mention about Prometheus->VictoriaMetrics exporter https://github.com/ryotarai/prometheus-tsdb-dump 2020-01-11 01:28:50 +02:00
Aliaksandr Valialkin
16fb128bbc app/vmselect/promql: do not take into account the previous point before time window in square brackets for min_over_time, max_over_time, rollup_first and rollup_last functions
This makes the behaviour for these functions similar to Prometheus when processing broken time series with irregular data points
like `gitlab_runner_jobs`. See https://gitlab.com/gitlab-org/gitlab-exporter/issues/50 for details.
2020-01-11 00:26:11 +02:00
Aliaksandr Valialkin
1c445bf7eb vendor: update github.com/valyala/fastjson from v1.4.2 to v1.4.5
This should fix parsing Inf values in `/api/v1/import`. The previous attempt to fix this in VictoriaMetrics v1.32.1 was unsuccessful.
2020-01-10 23:14:34 +02:00
Aliaksandr Valialkin
adc36d00b7 app/vmselect/promql: properly handle aggr(aggr_over_time(...)) 2020-01-10 21:57:11 +02:00
Aliaksandr Valialkin
87a106702b app/vmselect/promql: add aggr_over_time(("aggr_func1", "aggr_func2", ...), m[d]) function
This function can be used for simultaneous calculating of multiple `aggr_func*` functions
that accept range vector. For example, `aggr_over_time(("min_over_time", "max_over_time"), m[d])`
would calculate `min_over_time` and `max_over_time` for `m[d]`.
2020-01-10 21:18:12 +02:00
Aliaksandr Valialkin
c314d9a219 app/vmselect/promql: add tmin_over_time(m[d]) and tmax_over_time(m[d]) functions
These functions return timestamp in seconds for the minimum and maximum value for `m` over time range `d`
2020-01-10 19:39:34 +02:00
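Hedged examples (the metric and window are illustrative); both functions return timestamps in seconds:

    tmin_over_time(node_load1[1h])
    tmax_over_time(node_load1[1h])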
Aliaksandr Valialkin
706b33dc82 docs: fix spelling typos 2020-01-09 23:43:45 +02:00
Aliaksandr Valialkin
1029b6ab34 lib/backup/s3remote: check whether the file exists before deleting it
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/284
2020-01-09 23:20:51 +02:00
Aliaksandr Valialkin
705af61587 app/{vmbackup,vmrestore}: add backup complete file to backup when it is complete and check for this file before restoring from backup
This should prevent restoring from incomplete backups.

Add the `-skipBackupCompleteCheck` command-line flag to `vmrestore` in order to be able to restore from old backups without the `backup complete` file.
2020-01-09 15:35:45 +02:00
Aliaksandr Valialkin
7edbd930d5 vendor: update github.com/valyala/fastjson from v1.4.1 to v1.4.2
This fixes parsing of `inf` and `nan` values in json lines passed to `/api/v1/import`
2020-01-08 20:48:08 +02:00
Aliaksandr Valialkin
da62e894dd README.md: remove height="200px" from logo image, since it is improperly displayed on smartphones 2020-01-08 20:29:37 +02:00
Aliaksandr Valialkin
24a852f900 README.md: typo fix 2020-01-08 14:44:28 +02:00
Aliaksandr Valialkin
7c6df1e51d app/vmselect/promql: skip rate calculation for the first point on time series 2020-01-08 14:42:44 +02:00
Aliaksandr Valialkin
7d8d921db9 docs: add references to Remote Write Storage Wars
Also mention that VictoriaMetrics uses less RAM than Thanos Store Gateway - see https://github.com/thanos-io/thanos/issues/448 for details.
2020-01-04 23:58:27 +02:00
Aliaksandr Valialkin
53e176ed67 lib/storage: limit maxRawRowsPerPartition to 500K for any number of rawRowsShardsPerPartition
This should reduce write amplification for high ingestion rate on multi-CPU systems
2020-01-04 23:58:23 +02:00
Aliaksandr Valialkin
1f941875db docs/CaseStudies.md: add link to Remote Write Storage Wars talk from Adidas at PromCon 2019 2020-01-04 16:51:02 +02:00
Aliaksandr Valialkin
76707b2ab9 app/vmselect/promql: fix calculations for histogram_share 2020-01-04 14:44:15 +02:00
Aliaksandr Valialkin
89b551201c lib/metricsql: export IsRollupFunc and IsTransformFunc, since they can be used by package users 2020-01-04 13:25:13 +02:00
Aliaksandr Valialkin
8cfd4decea LICENSE: update year 2020-01-04 13:21:11 +02:00
Aliaksandr Valialkin
accad01b3e app/vmselect/promql: add missing MetricName into netstorage.Result in tests 2020-01-04 12:53:14 +02:00
Aliaksandr Valialkin
6f29d37cb5 app/vmselect/promql: add histogram_share(le, buckets) function 2020-01-04 12:53:08 +02:00
Aliaksandr Valialkin
2290503140 app/vmselect/promql: add absent_over_time(m[d]) func similar to the function in Prometheus 2.16
See https://github.com/prometheus/prometheus/issues/2882
2020-01-04 12:53:01 +02:00
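A hedged example of the new rollup function (the selector and window are illustrative); it returns 1 if the selected series have no samples on the given lookbehind window:

    absent_over_time(up{job="node-exporter"}[1h])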
Aliaksandr Valialkin
67f94bbe12 app/vmselect/promql: add histogram_over_time(m[d]) rollup function 2020-01-04 12:52:56 +02:00
Aliaksandr Valialkin
9a1f6848ca app/vmselect/promql: fix results caching for multi-arg rollup functions such as quantile_over_time
Previously only a single arg was taken into account, so caching didn't work properly for multi-arg rollup funcs.
2020-01-03 20:44:54 +02:00
Aliaksandr Valialkin
3d0c7b095a app/vmselect/promql: use scrapeInterval instead of window in denominator when calculating rate for the first point on the time series
This should provide better estimation for `rate` in the beginning of time series.
2020-01-03 19:03:32 +02:00
Aliaksandr Valialkin
588531dd76 lib/uint64set: reduce memory usage when storing big number of sparse metric_id values 2020-01-03 18:17:17 +02:00
Aliaksandr Valialkin
6ea7f23446 app/vmselect/promql: increase the estimated number of time series returned by aggr() by (something) from 100 to 1K, since 100 may result in OOM for high number of time series 2020-01-03 01:02:30 +02:00
Aliaksandr Valialkin
e0abf45d45 app/vmselect/promql: add share_le_over_time and share_gt_over_time functions for SLI and SLO calculations 2020-01-03 00:41:36 +02:00
Aliaksandr Valialkin
19962e2732 docs: refer to standalone MetricsQL package 2020-01-02 23:43:43 +02:00
Aliaksandr Valialkin
a15a6d9ac1 vendor: update github.com/VictoriaMetrics/fastcache from v1.5.4 to v1.5.5 2019-12-29 18:18:12 +02:00
Aliaksandr Valialkin
0d2e83e9d7 lib/metricsql: add example for ExpandWithExprs 2019-12-26 21:31:15 +02:00
Aliaksandr Valialkin
e3ae813e6a vendor: make vendor-update 2019-12-26 19:42:53 +02:00
Aliaksandr Valialkin
940c55f9d1 vendor: update github.com/valyala/gozstd from v1.6.3 to v1.6.4
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/281
2019-12-26 19:31:07 +02:00
Aliaksandr Valialkin
eb1a66c577 lib/metricsql: add ExpandWithExprs 2019-12-25 22:20:21 +02:00
Aliaksandr Valialkin
453d71d082 Rename lib/promql to lib/metricsql and apply small fixes 2019-12-25 22:09:09 +02:00
Mike Poindexter
009d1559db Split Extended PromQL parsing to a separate library 2019-12-25 22:09:07 +02:00
Aliaksandr Valialkin
ff18101d30 app/vmselect/promql: make sure AdjustStartEnd returns time range covering the same number of points as the initial time range
This should prevent the following panic at app/vmselect/promql/binary_op.go:255:

    BUG: len(leftVaues) must match len(rightValues) and len(dstValues)
2019-12-24 22:45:49 +02:00
Aliaksandr Valialkin
f22c9dbb0f lib/fs: typo fix in fadvise_unix.go 2019-12-24 21:00:04 +02:00
Aliaksandr Valialkin
d3c185f0ca lib/encoding: log the compressed block contents if it cannot be decompressed or unmarshaled
This should help detecting the root cause of https://github.com/VictoriaMetrics/VictoriaMetrics/issues/281
2019-12-24 20:48:25 +02:00
Aliaksandr Valialkin
091e35cf0c lib/encoding: mention src contents in error message returned from unmarshalInt64NearestDelta*
This should simplify detecting the root cause of the issue at https://github.com/VictoriaMetrics/VictoriaMetrics/issues/281
2019-12-24 20:41:38 +02:00
Aliaksandr Valialkin
0e51058a0d lib/encoding: mention unpacked block size in the error message if unparsed tail left
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/281
2019-12-24 20:35:20 +02:00
Aliaksandr Valialkin
e24ee43109 app/vmselect/promql: adjust calculations for rate and increase for the first value
These calculations should trigger alerts on `/api/v1/query` for counters starting from values greater than 0.
2019-12-24 19:41:03 +02:00
Aliaksandr Valialkin
9a2554691c app/vmselect/promql: properly calculate rate on the first data point
It is calculated as `value / scrape_interval`, since the value was missing on the previous scrape,
i.e. we can assume its value was 0 at this time.
2019-12-24 15:55:15 +02:00
Aliaksandr Valialkin
97de50dd4c app/vmselect/netstorage: improve error message when reading data size in readBytes 2019-12-24 14:40:14 +02:00
Aliaksandr Valialkin
c0060c5858 deployment/docker: remove Docker image tag in docker-compose images
This should allow loading the latest images by default
2019-12-24 13:27:42 +02:00
Aliaksandr Valialkin
29d2ce54cb all: use gozstd instead of pure Go zstd for GOARCH=amd64 2019-12-24 12:43:59 +02:00
Aliaksandr Valialkin
afa8b34d27 Revert "lib/logger: prevent from blocking when log output isn't consumed in timely manner"
This reverts commit 9f50232e70.

Reason to revert: this leaves incomplete logs on app shutdown.
2019-12-24 12:20:45 +02:00
Aliaksandr Valialkin
6358cf3d47 app/vmselect/netstorage: move MustAdviseSequentialRead to lib/fs 2019-12-23 23:16:26 +02:00
Aliaksandr Valialkin
44f886cc9c lib/encoding/zstd: typo fix 2019-12-23 18:37:20 +02:00
Aliaksandr Valialkin
108a60d69e lib/encoding/zstd: call zstd.Decoder.Close instead of zstd.Decoder.Reset in order to free up occupied goroutines
This should fix goroutine leak for https://github.com/klauspost/compress/issues/195
2019-12-23 18:32:28 +02:00
Aliaksandr Valialkin
335bd0ac0a lib/encoding/zstd: prevent from possible encoder leak when concurrent goroutines create encoders for the same compressionLevel
Thanks to @klauspost for the pointer to this issue. See https://github.com/klauspost/compress/issues/195 for details.
2019-12-23 18:06:02 +02:00
Aliaksandr Valialkin
ba17fcbcc5 deployment/docker: update docker image tags from v1.31.1-cluster to v1.31.2-cluster 2019-12-20 13:06:49 +02:00
Aliaksandr Valialkin
9f50232e70 lib/logger: prevent from blocking when log output isn't consumed in timely manner
Drop log messages instead of blocking and increment `vm_log_messages_dropped_total` metric.
2019-12-20 11:49:42 +02:00
Aliaksandr Valialkin
cc8a1bae0e app/vmselect: add -search.maxExportDuration command-line flag for limiting /api/v1/export duration
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/275
2019-12-20 11:37:18 +02:00
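A hedged usage sketch for the new flag (the duration value is illustrative):

    ./victoria-metrics-prod -search.maxExportDuration=8h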
Aliaksandr Valialkin
a37a006f11 lib/storage: scale ingestion performance by sharding rawRows on systems with more than 8 CPU cores 2019-12-19 18:17:05 +02:00
Aliaksandr Valialkin
8d79412b26 lib/storage: optimize bulk import performance when multiple data points are inserted for the same time series
This should speed up `/api/v1/import` and make it more scalable on multi-core systems.
2019-12-19 15:13:36 +02:00
Aliaksandr Valialkin
8b56b849e9 app/vminsert: return StatusNoContent http response for /api/v1/import to be consistent with other insert handlers 2019-12-19 01:22:01 +02:00
Aliaksandr Valialkin
05ec8afb3a lib/httpserver: sync the code with master branch 2019-12-18 23:08:32 +02:00
Aliaksandr Valialkin
a045c62532 docs/ExtendedPromQL.md: rewording regarding scalar vs instant vector difference 2019-12-18 21:47:31 +02:00
Aliaksandr Valialkin
cd04f6e82d docs/Home.md: fix link to case studies 2019-12-18 01:05:15 +02:00
Aliaksandr Valialkin
4e8583bb02 docs: renaming: PromQL extensions -> MetricsQL 2019-12-18 00:57:53 +02:00
Aliaksandr Valialkin
198debc1c6 deployment/docker: update docker image tags from v1.30.6-cluster to v1.31.1-cluster 2019-12-16 01:49:00 +02:00
Aliaksandr Valialkin
6a185b7809 app/vmselect: add ability to pass match[], start and end to /api/v1/labels
This makes the `/api/v1/labels` handler consistent with already existing functionality for `/api/v1/label/.../values`.

See https://github.com/prometheus/prometheus/issues/6178 for more details.
2019-12-15 00:20:43 +02:00
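A hedged example of the extended handler, assuming a single-node VictoriaMetrics listening on :8428 (in the cluster version the same path is served by vmselect under its path prefix); the selector and the time range are illustrative:

    curl -g 'http://localhost:8428/api/v1/labels?match[]=up&start=1576108800&end=1576195200'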
Aliaksandr Valialkin
a7bf8e77af app/vminsert: simultaneously accept telnet put and HTTP /api/put OpenTSDB metrics at -opentsdbListenAddr
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/266
2019-12-14 00:42:18 +02:00
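A hedged sketch of the two ingestion paths accepted on the same port, assuming `-opentsdbListenAddr=:4242` is set (metric, timestamp and tags are illustrative):

    echo "put sys.cpu.user 1575902767 42.5 host=web01" | nc localhost 4242
    curl -d '{"metric":"sys.cpu.user","timestamp":1575902767,"value":42.5,"tags":{"host":"web01"}}' http://localhost:4242/api/put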
Aliaksandr Valialkin
bc3984a5b3 lib/logger: add -loggerFormat for choosing log message formats
Supported formats: default, json

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/265
2019-12-13 15:09:18 +02:00
Aliaksandr Valialkin
aaf2545bdb make vendor-update 2019-12-12 19:38:56 +02:00
Aliaksandr Valialkin
b238997a84 all: rename Extended PromQL to PromQL extensions 2019-12-12 19:29:59 +02:00
Aliaksandr Valialkin
bf8cf77694 docs: sync with master branch 2019-12-12 14:54:42 +02:00
Aliaksandr Valialkin
fef2eefb5e docs: add Dreamteam numbers 2019-12-12 01:01:58 +02:00
Aliaksandr Valialkin
aad6ac76b9 docs/Single-server-VictoriaMetrics.md: sync with README.md 2019-12-12 00:56:04 +02:00
Aliaksandr Valialkin
cffaeda0f1 all: publish Docker images for the following GOARCH: amd64, arm, arm64, ppc64le and 386
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/258
2019-12-11 23:33:11 +02:00
Aliaksandr Valialkin
c25b97829f app/vmselect/promql: return lower and upper bounds for the estimated percentile from histogram_quantile if third arg is passed
Updates https://github.com/prometheus/prometheus/issues/5706
2019-12-11 14:00:18 +02:00
Aliaksandr Valialkin
557909aa81 docs: sync with master branch 2019-12-11 14:00:15 +02:00
Aliaksandr Valialkin
f79b61e2a1 app/vmselect/promql: return matrix instead of vector on subqueries to /api/v1/query like Prometheus does 2019-12-11 00:57:54 +02:00
Aliaksandr Valialkin
5d2ff573aa app/vmselect/promql: allow negative offsets
Updates https://github.com/prometheus/prometheus/issues/6282
2019-12-11 00:57:51 +02:00
Aliaksandr Valialkin
c444a929a6 deployment/docker: update Docker image tags from v1.30.5-cluster to v1.30.6-cluster 2019-12-10 00:13:59 +02:00
Aliaksandr Valialkin
7edfa4d0cc docs: use relative links 2019-12-09 23:05:39 +02:00
Aliaksandr Valialkin
e81a2bfdb3 docs: mention about /api/v1/import in Single-server-VictoriaMetrics.md 2019-12-09 23:05:38 +02:00
Aliaksandr Valialkin
033d252836 docs: mention about /api/v1/import in Cluster-VictoriaMetrics.md 2019-12-09 23:00:37 +02:00
Aliaksandr Valialkin
bd60dcb8ed deployment/docker: update Go builder from go1.13.4 to go1.13.5 2019-12-09 22:57:43 +02:00
Aliaksandr Valialkin
c81a89a8ed app/vminsert: add /api/v1/import handler
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6
2019-12-09 22:37:49 +02:00
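A hedged example of pushing a single series into the new handler, assuming a single-node VictoriaMetrics on :8428 (the data is illustrative); the payload format matches the JSON lines produced by /api/v1/export:

    curl -X POST 'http://localhost:8428/api/v1/import' -d '{"metric":{"__name__":"foo","job":"bar"},"values":[10,20],"timestamps":[1575902400000,1575902460000]}'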
Aliaksandr Valialkin
0c304439d4 app/vminsert: consistency renaming for counters 2019-12-09 16:44:26 +02:00
Aliaksandr Valialkin
3694efd005 lib/{mergeset,storage}: log info message when both source and destination part paths from txn are missing during startup
This is an expected condition after an unclean shutdown (OOM, hard reset, `kill -9`) on an NFS disk.
2019-12-09 15:45:23 +02:00
Roman Khavronenko
924af22ced #251 - add Logging rate panel (#255) 2019-12-09 13:07:09 +02:00
Aliaksandr Valialkin
b809df03f8 vendor: fix broken build for GOARCH=arm64 on golang.org/x/sys/unix 2019-12-08 13:28:20 +02:00
Aliaksandr Valialkin
9442e619ea vendor: fix arm build for golang.org/x/sys/unix/zptrace_armnn_linux.go 2019-12-08 12:49:21 +02:00
Aliaksandr Valialkin
c217a53c35 make vendor-update 2019-12-07 23:11:26 +02:00
Aliaksandr Valialkin
3534e71c96 app/vminsert/influx: add a test case from https://community.librenms.org/t/integration-with-victoriametrics/9689 2019-12-07 23:00:54 +02:00
Aliaksandr Valialkin
8cf015c34f README.md: mention that VictoriaMetrics is built on shared nothing architecture 2019-12-05 20:38:20 +02:00
Aliaksandr Valialkin
7a775714ab deployment/docker: update Docker image tags from v1.30.4-cluster to v1.30.5-cluster 2019-12-05 20:15:50 +02:00
Aliaksandr Valialkin
e243429b39 docs: add docs as in the single-node branch 2019-12-05 19:28:29 +02:00
Aliaksandr Valialkin
d39bba3547 app/vmselect/promql: add {topk|bottomk}_{min|max|avg|median} aggregate functions for returning the exact k time series on the given time range
The full list of functions added:
- `topk_min(k, q)` - returns top K time series with the max minimums on the given time range
- `topk_max(k, q)` - returns top K time series with the max maximums on the given time range
- `topk_avg(k, q)` - returns top K time series with the max averages on the given time range
- `topk_median(k, q)` - returns top K time series with the max medians on the given time range
- `bottomk_min(k, q)` - returns bottom K time series with the min minimums on the given time range
- `bottomk_max(k, q)` - returns bottom K time series with the min maximums on the given time range
- `bottomk_avg(k, q)` - returns bottom K time series with the min averages on the given time range
- `bottomk_median(k, q)` - returns bottom K time series with the min medians on the given time range
2019-12-05 19:27:45 +02:00
Aliaksandr Valialkin
639967db59 lib/{mergeset,storage}: make sure pending transaction deletions are finished before and after runTransactions call.
`runTransactions` call issues async deletions for transaction files. The previously issued transaction deletions
can race with the next call to `runTransactions`. Prevent this by waiting until all the pending transaction
deletions are funished in the beginning of `runTransactions`. Also make sure that all the pending transaction
deletions are finished before returning from `runTransactions`.
2019-12-04 21:40:52 +02:00
Aliaksandr Valialkin
7c0dd85a7c lib/httpserver: add /ping handler for compatibility with Influx agents
Certain Influx agents check for `/ping` endpoint before starting
to send Influx line protocol data. See https://docs.influxdata.com/influxdb/v1.7/tools/api/#ping-http-endpoint
2019-12-04 19:18:18 +02:00
Aliaksandr Valialkin
877b83ce97 deployment/docker: update docker image tags from v1.30.3-cluster to v1.30.4-cluster 2019-12-04 01:53:04 +02:00
Aliaksandr Valialkin
e0f43e1f66 app/vmselect: add placeholders for /api/v1/rules and /api/v1/alerts 2019-12-03 19:38:09 +02:00
Aliaksandr Valialkin
534da0a8c3 lib/storage: fall back to the global inverted index if a filter matches too many time series in the per-day index
Previously this resulted in an error message. The query may succeed via a search in the global index.
2019-12-03 14:48:08 +02:00
Aliaksandr Valialkin
6eb698d1cc lib/storage: fix printing tag filters in TagFilters.String 2019-12-03 14:25:20 +02:00
Aliaksandr Valialkin
c04f60db35 lib/storage: print __name__ instead of empty string in user-visible tag filters 2019-12-03 14:18:18 +02:00
Aliaksandr Valialkin
625f6ca761 lib/storage: optimize regexp filter search 2019-12-03 00:33:53 +02:00
Aliaksandr Valialkin
47077c02ba deployment/docker: update image tags from v1.30.2-cluster to v1.30.3-cluster 2019-12-02 22:51:31 +02:00
Aliaksandr Valialkin
6bee9115aa vendor: update github.com/VictoriaMetrics/metrics from v1.9.1 to v1.9.2
This fixes possible deadlock when metrics.WritePrometheus calls Gauge callback, which calls metrics functions with internal lock.
2019-12-02 22:31:47 +02:00
Aliaksandr Valialkin
b9616c017f lib/{mergeset,storage}: remove transaction files only after the mentioned dirs are really removed
This should fix the issue on NFS when incompletely removed dirs may be left
after unclean shutdown (OOM, kill -9, hard reset, etc.), while the corresponding transaction
files are already removed.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/162
2019-12-02 21:34:37 +02:00
Aliaksandr Valialkin
4e22b521c2 lib/storage: remove metricID with missing metricID->metricName entry
The metricID->metricName entry can be missing in the indexdb after unclean shutdown
when only a part of entries for new time series is written into indexdb.

Recover from such a situation by removing the broken metricID. A new metricID
will be automatically created for the time series with the given metricName
when a new data point arrives for it.
2019-12-02 20:52:13 +02:00
Aliaksandr Valialkin
387f62f468 deployment/docker: update docker image tag from v1.30.1-cluster to v1.30.2-cluster 2019-12-02 15:17:41 +02:00
Aliaksandr Valialkin
5a62415bec lib/storage: protect from time drift during indexdb rotation
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/248
2019-12-02 14:43:11 +02:00
Aliaksandr Valialkin
cf85c567d1 lib/logger: merge file and line labels into location="file:line"
This should improve the usability for `vm_log_messages_total` metric during practical queries
2019-12-02 14:43:09 +02:00
Aliaksandr Valialkin
f055dbefda lib/storage: generate more human-friendly result in TagFilters.String 2019-12-02 13:56:40 +02:00
Aliaksandr Valialkin
819bb36852 app/vmselect/promql: estimate per-series scrape interval as 0.6 quantile for the first 100 intervals
This should improve scrape interval estimation for time series with gaps.
2019-12-02 13:43:04 +02:00
Aliaksandr Valialkin
29f39f866e lib/logger: consistency renaming from vm_log_messages_count to vm_log_messages_total, since this is a counter 2019-12-02 00:47:12 +02:00
Aliaksandr Valialkin
15eaff1745 lib/logger: track the number of log messages by (level, file, line) in the vm_log_messages_count metric 2019-12-01 18:38:30 +02:00
Aliaksandr Valialkin
d456ec7589 lib/netutil: use IPv6 for both listening and dialing if -enabledTCP6 is set
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/244
2019-12-01 02:52:53 +02:00
Aliaksandr Valialkin
1595dcd3d9 app/vminsert/influx: allow empty measurement in Influx line protocol
In this case metric names are mapped directly from field names without any prefixes.
2019-11-30 21:59:07 +02:00
Aliaksandr Valialkin
1e2019b1b6 app/vmselect/promql: fix corner case for increase over time series with gaps
In this case `increase` could return invalid high value for the first point after the gap.
2019-11-30 01:34:18 +02:00
Aliaksandr Valialkin
4c63caa37c deployment/docker/certs: update TLS certs source from alpine:3.9 to alpine:3.10 2019-11-29 19:55:36 +02:00
Aliaksandr Valialkin
274d8bcb7b deployment/docker/docker-compose.yml: remove superfluous volume mount (#246)
The `provisioning/dashboards` folder should be already mounted on the previous line.

This should fix the `/bin/sh: can't create dashboards/vm.json: Permission denied` error on `docker-compose up`
2019-11-29 18:10:50 +02:00
Aliaksandr Valialkin
7e734433a3 lib/backup: cosmetic fixes after #243 2019-11-29 18:07:41 +02:00
glebsam
4a192cb832 Add option to provide custom endpoint for S3, add option to specify S3 config profile (#243)
* Add option to provide custom endpoint for S3 for use with s3-compatible storages, add option to specify S3 config profile

* make fmt
2019-11-29 18:07:39 +02:00
Aliaksandr Valialkin
4810f1dde6 lib/netutil: add -enableTCP6 command-line flag for enabling listening for IPv6 additionally to IPv4 TCP ports 2019-11-29 17:33:07 +02:00
Aliaksandr Valialkin
93dbec971b deployment/docker: update docker image tags from v1.30.0-cluster to v1.30.1-cluster 2019-11-28 22:26:25 +02:00
Aliaksandr Valialkin
90f2530f9f README.md: add monitoring section 2019-11-28 19:16:05 +02:00
Aliaksandr Valialkin
409c939621 lib/backup: remove flock.lock file in empty dirs
This fixes an issue when VictoriaMetrics doesn't see the restored data after the following operations:

1. Stop VictoriaMetrics.
2. Delete `<-storageDataPath>` dir.
3. Start VictoriaMetrics, then stop it.
4. Restore data from backup with `vmrestore`.
5. Start VictoriaMetrics.

`vmrestore` didn't properly delete empty dirs in `<-storageDataPath>/indexdb` because of the remaining `flock.lock` files in these dirs.
2019-11-28 13:39:28 +02:00
Aliaksandr Valialkin
572fe61857 README.md: remove the unnecessary step during restoring from backups 2019-11-27 19:56:15 +02:00
Aliaksandr Valialkin
396ed27759 vendor: make vendor-update 2019-11-27 15:34:18 +02:00
Aliaksandr Valialkin
2571903522 vendor: update github.com/VictoriaMetrics/fastcache from v1.5.2 to v1.5.4 2019-11-27 15:31:36 +02:00
Aliaksandr Valialkin
093f94d2db deployment/docker: update Grafana from v6.4.4 to v6.5.0 2019-11-27 15:10:01 +02:00
Aliaksandr Valialkin
8ccbcaf99f deployment/docker: update image tags from v1.29.5-cluster to v1.30.0-cluster 2019-11-27 14:54:21 +02:00
Aliaksandr Valialkin
def9ccd360 app/vmselect/prometheus: consistently apply nocache arg to /api/v1/query the same way as to /api/v1/query_range
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/241
2019-11-26 22:55:50 +02:00
Aliaksandr Valialkin
e0ac068112 app/vmselect/prometheus: fix content-type for /api/v1/export responses
The correct Content-Type should be `application/stream+json` instead of `application/json`
Thanks to Joshua Ryder for pointing to this.
2019-11-26 17:44:27 +02:00
Aliaksandr Valialkin
28cc4c09b5 app/vmselect/promql: remove zero timeseries from prometheus_buckets output 2019-11-25 19:10:13 +02:00
Aliaksandr Valialkin
8811bec14e app/vmselect/prometheus: reduce default value for -search.latencyOffset from 60s to 30s
30 seconds should be enough for almost all the cases
2019-11-25 16:33:36 +02:00
Aliaksandr Valialkin
f7da9b2db2 app/vmselect/promql: allow nested parens 2019-11-25 16:13:33 +02:00
Aliaksandr Valialkin
d2619d6dce vendor: update github.com/VictoriaMetrics/metrics from v1.9.0 to v1.9.1 2019-11-25 15:22:50 +02:00
Aliaksandr Valialkin
f46fb6c740 app/vmselect/promql: re-use metrics.Histogram when calculating histogram function for each point on the graph
This should reduce the amount of memory allocations
2019-11-25 14:24:30 +02:00
Aliaksandr Valialkin
0f184affa7 app/vmselect/promql: optimize binary search over big number of samples during rollup calculations 2019-11-25 14:01:54 +02:00
Aliaksandr Valialkin
dbd07041ae app/vmselect/promql: adjust tests after the upgrade of github.com/VictoriaMetrics/metrics from v1.8.3 to v1.9.0 2019-11-25 13:44:08 +02:00
Aliaksandr Valialkin
406e36f817 vendor: update github.com/VictoriaMetrics/metrics from v1.8.3 to v1.9.0 2019-11-25 13:19:34 +02:00
Aliaksandr Valialkin
8bb254d960 app/vmselect/promql: add histogram aggregate function, which is useful for building heatmaps from multiple time series 2019-11-24 00:04:15 +02:00
Aliaksandr Valialkin
e70f543321 vendor: update github.com/VictoriaMetrics/metrics from v1.8.2 to v1.8.3 2019-11-24 00:04:14 +02:00
Aliaksandr Valialkin
d24fc87a6f lib/decimal: calculate ln2/ln10 constant during compile time 2019-11-23 15:52:39 +02:00
Aliaksandr Valialkin
414259f47b app/vmselect/promql: do not take into account buckets with negative counters in prometheus_buckets 2019-11-23 14:19:19 +02:00
Aliaksandr Valialkin
193d553f6d app/vmselect/promql: properly handle histogram_quantile(0, ...) with zero buckets 2019-11-23 14:02:25 +02:00
Aliaksandr Valialkin
f8298c7f13 app/vmselect: add vm_per_query_{rows,series}_processed_count histograms 2019-11-23 13:23:03 +02:00
Aliaksandr Valialkin
b1c3284fd0 dashboards: remove deprecated dashboards - now only victoriametrics.json is officially supported 2019-11-23 12:43:38 +02:00
Aliaksandr Valialkin
654473f6c6 vendor: update github.com/VictoriaMetrics/metrics from v1.8.1 to v1.8.2 2019-11-23 11:49:18 +02:00
Aliaksandr Valialkin
4d76977745 app/vmselect/promql: transparently apply prometheus_buckets in histogram_quantile 2019-11-23 11:49:16 +02:00
Aliaksandr Valialkin
cfeb606e73 vendor: update github.com/VictoriaMetrics/metrics from v1.8.0 to v1.8.1 2019-11-23 00:48:55 +02:00
Aliaksandr Valialkin
2af7ca1122 vendor: update github.com/VictoriaMetrics/metrics from v1.7.2 to v1.8.0. This version supports histograms 2019-11-23 00:21:57 +02:00
Aliaksandr Valialkin
5f6f03c692 app/vmselect/promql: add prometheus_buckets function for converting the upcoming histogram buckets from github.com/VictoriaMetrics/metrics to Prometheus-compatible buckets 2019-11-23 00:21:56 +02:00
Aliaksandr Valialkin
17d08c1fe0 app/vmselect: adjust end arg instead of adjusting start arg if start > end
`start` arg has higher chances to be set properly comparing to `end` arg,
so it is expected that the `end` arg could be adjusted if it was set incorrectly.
2019-11-22 16:12:53 +02:00
Aliaksandr Valialkin
14ba958e9a deployment/docker: update docker image tag from v1.29.3-cluster to v1.29.5-cluster 2019-11-22 14:05:17 +02:00
Aliaksandr Valialkin
7c48f8611f vendor: updated github.com/valyala/gozstd from v1.6.2 to v1.6.3 2019-11-21 23:56:27 +02:00
Aliaksandr Valialkin
b9e53490b9 lib/storage: move non-matching tag filters to the top at matchTagFilters
This should reduce the amount of useless work needed for matching the next metricNames.
2019-11-21 21:40:36 +02:00
Aliaksandr Valialkin
33d9d63393 lib/storage: speed up time series search for queries with multiple filters
Use optimized specialized binary search for uint64 metricIDs instead of generic sort.Search.
2019-11-21 18:43:40 +02:00
Aliaksandr Valialkin
926290d73e Makefile: create files with sha256 checksums during make release
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/19
2019-11-20 22:45:23 +02:00
Aliaksandr Valialkin
a02a57fbe9 lib/storage: verify the number of returned metricIDs in BenchmarkHeadPostingForMatchers 2019-11-20 15:40:03 +02:00
Aliaksandr Valialkin
3d1f4408cf lib/decimal: increase decimal->float conversion speed for integer numbers 2019-11-20 14:09:10 +02:00
Aliaksandr Valialkin
f1f2eff08f lib/decimal: reduce rounding error when converting from decimal to float with negative exponent
While at it, slightly increase the conversion performance by moving fast path to the top of the loop.
2019-11-19 23:34:41 +02:00
Aliaksandr Valialkin
2929a41e3b make vendor-update 2019-11-19 21:34:44 +02:00
Aliaksandr Valialkin
17eca31989 lib/backup: retrieve only the required metadata when reading GCS objects 2019-11-19 21:30:51 +02:00
Aliaksandr Valialkin
ccf3d143c5 make vendor-update 2019-11-19 21:30:49 +02:00
Aliaksandr Valialkin
216a260ced app/{vmbackup,vmrestore}: add -maxBytesPerSecond command-line flag for limiting the used network bandwidth during backup / restore 2019-11-19 20:32:43 +02:00
Aliaksandr Valialkin
9d1ee1e2ae lib/backup: prevent restoring to a directory which is in use by VictoriaMetrics during the restore 2019-11-19 18:35:59 +02:00
Aliaksandr Valialkin
5ae47e8940 app/vmselect/prometheus: properly adjust too big `time` on /api/v1/query
Too big `time` must be adjusted to `now()-queryOffset`.
2019-11-19 00:42:07 +02:00
Aliaksandr Valialkin
6ca4b94511 lib/storage: increase the number of created time series in BenchmarkHeadPostingForMatchers in order to be on par with Prometheus
The previous commit was accidentally creating a 10x smaller number of time series than Prometheus
and this led to invalid benchmark results.

The updated benchmark results:

benchmark                                                          old ns/op      new ns/op     delta
BenchmarkHeadPostingForMatchers/n="1"                              272756688      6194893       -97.73%
BenchmarkHeadPostingForMatchers/n="1",j="foo"                      138132923      10781372      -92.19%
BenchmarkHeadPostingForMatchers/j="foo",n="1"                      134723762      10632834      -92.11%
BenchmarkHeadPostingForMatchers/n="1",j!="foo"                     195823953      10679975      -94.55%
BenchmarkHeadPostingForMatchers/i=~".*"                            7962582919     100118510     -98.74%
BenchmarkHeadPostingForMatchers/i=~".+"                            7589543864     154955671     -97.96%
BenchmarkHeadPostingForMatchers/i=~""                              1142371741     258003769     -77.42%
BenchmarkHeadPostingForMatchers/i!=""                              9964150263     159783895     -98.40%
BenchmarkHeadPostingForMatchers/n="1",i=~".*",j="foo"              216995884      10937895      -94.96%
BenchmarkHeadPostingForMatchers/n="1",i=~".*",i!="2",j="foo"       202541348      10990027      -94.57%
BenchmarkHeadPostingForMatchers/n="1",i!=""                        486285711      87004349      -82.11%
BenchmarkHeadPostingForMatchers/n="1",i!="",j="foo"                350776931      53342793      -84.79%
BenchmarkHeadPostingForMatchers/n="1",i=~".+",j="foo"              380888565      54256156      -85.76%
BenchmarkHeadPostingForMatchers/n="1",i=~"1.+",j="foo"             89500296       21823279      -75.62%
BenchmarkHeadPostingForMatchers/n="1",i=~".+",i!="2",j="foo"       379529654      46671359      -87.70%
BenchmarkHeadPostingForMatchers/n="1",i=~".+",i!~"2.*",j="foo"     424563825      53915842      -87.30%

VictoriaMetrics uses 1GB of RAM during the benchmark (vs 3.5GB of RAM for Prometheus)
2019-11-18 19:48:27 +02:00
Aliaksandr Valialkin
6f61fd367a lib/storage: add BenchmarkHeadPostingForMatchers similar to the benchmark from Prometheus
See the corresponding benchmark in Prometheus - 23c0299d85/tsdb/head_bench_test.go (L52)

The benchmark allows performing apples-to-apples comparison of time series search
in Prometheus and VictoriaMetrics. The following article - https://www.robustperception.io/evaluating-performance-and-correctness -
contains incorrect numbers for VictoriaMetrics, since this benchmark didn't exist yet. Fix it.

Benchmarks can be repeated with the following commands from Prometheus and VictoriaMetrics source code roots:

- Prometheus: GOMAXPROCS=1 go test ./tsdb/ -run=111 -bench=BenchmarkHeadPostingForMatchers
- VictoriaMetrics: GOMAXPROCS=1 go test ./lib/storage/ -run=111 -bench=BenchmarkHeadPostingForMatchers

Benchmark results:
benchmark                                                          old ns/op      new ns/op     delta
BenchmarkHeadPostingForMatchers/n="1"                              272756688      364977        -99.87%
BenchmarkHeadPostingForMatchers/n="1",j="foo"                      138132923      1181636       -99.14%
BenchmarkHeadPostingForMatchers/j="foo",n="1"                      134723762      1141578       -99.15%
BenchmarkHeadPostingForMatchers/n="1",j!="foo"                     195823953      1148056       -99.41%
BenchmarkHeadPostingForMatchers/i=~".*"                            7962582919     8716755       -99.89%
BenchmarkHeadPostingForMatchers/i=~".+"                            7589543864     12096587      -99.84%
BenchmarkHeadPostingForMatchers/i=~""                              1142371741     16164560      -98.59%
BenchmarkHeadPostingForMatchers/i!=""                              9964150263     12230021      -99.88%
BenchmarkHeadPostingForMatchers/n="1",i=~".*",j="foo"              216995884      1173476       -99.46%
BenchmarkHeadPostingForMatchers/n="1",i=~".*",i!="2",j="foo"       202541348      1299743       -99.36%
BenchmarkHeadPostingForMatchers/n="1",i!=""                        486285711      11555193      -97.62%
BenchmarkHeadPostingForMatchers/n="1",i!="",j="foo"                350776931      5607506       -98.40%
BenchmarkHeadPostingForMatchers/n="1",i=~".+",j="foo"              380888565      6380335       -98.32%
BenchmarkHeadPostingForMatchers/n="1",i=~"1.+",j="foo"             89500296       2078970       -97.68%
BenchmarkHeadPostingForMatchers/n="1",i=~".+",i!="2",j="foo"       379529654      6561368       -98.27%
BenchmarkHeadPostingForMatchers/n="1",i=~".+",i!~"2.*",j="foo"     424563825      6757132       -98.41%

The first column (old) is for Prometheus, the second column (new) is for VictoriaMetrics.

Prometheus was using 3.5GB of RAM during the benchmark, while VictoriaMetrics was using 400MB of RAM.
2019-11-18 18:47:02 +02:00
Aliaksandr Valialkin
77bb66a5be app/vmselect/promql: properly calculate integrate(q[d]) 2019-11-13 21:11:03 +02:00
Aliaksandr Valialkin
c33640664a app/vmselect/promql: use universal approach for determining maxByteSliceLen on 32-bit and 64-bit archs
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/235
2019-11-13 20:26:07 +02:00
Aliaksandr Valialkin
d297b65089 lib/storage: add vm_cache_size_bytes{type="storage/hour_metric_ids"} metric 2019-11-13 20:26:05 +02:00
Aliaksandr Valialkin
31376fd353 deployment/docker: update docker image tag from v1.29.2-cluster to v1.29.3-cluster 2019-11-13 18:32:08 +02:00
Aliaksandr Valialkin
494ad0fdb3 lib/storage: remove inmemory index for recent hour, since it uses too much memory
Production workloads show that the index requires ~4KB of RAM per active time series.
This is too much for a high number of active time series (for example, 10 million active series would need roughly 40GB of RAM for this index alone), so let's delete this index.

Now the queries should fall back to the index for the current day instead of the index
for the recent hour. The query performance for the current day index should be good enough
given the 100M rows/sec scan speed per CPU core.
2019-11-13 18:08:58 +02:00
Aliaksandr Valialkin
90bde025f0 deployment/docker: update image tag from v1.29.0-cluster to v1.29.2-cluster 2019-11-13 15:24:44 +02:00
Aliaksandr Valialkin
633dd81bb5 lib/storage: add -disableRecentHourIndex flag for disabling inmemory index for recent hour
This may be useful for saving RAM with a high number of time series (aka high cardinality)
2019-11-13 15:10:12 +02:00
Aliaksandr Valialkin
f1620ba7c0 lib/storage: fix inmemory inverted index issues found in v1.29
Issues fixed:
- Slow startup times. Now the index is loaded from cache during start.
- High memory usage related to superfluous index copies every 10 seconds.
2019-11-13 13:35:38 +02:00
Aliaksandr Valialkin
87b39222be Revert "lib/fs: do not postpone directory removal on NFS error"
This reverts commit 21aeb02b46649ac9906cb37733f7b155a77a0db9.
2019-11-12 16:29:50 +02:00
Mike Poindexter
955a592106 Add test for invalid caching of tsids (#232)
* Add test for invalid caching of tsids

* Clean up error handling
2019-11-12 15:52:46 +02:00
Roman Khavronenko
ce8cc76a42 add links and fix cache metric name (#233) 2019-11-12 15:06:56 +02:00
Aliaksandr Valialkin
6afb7a50a9 deployment/docker: upgrade Grafana release from v6.4.3 to v6.4.4 2019-11-12 03:50:54 +02:00
Aliaksandr Valialkin
5b677a57e3 deployment/docker: upgrade Go from v1.13.3 to v1.13.4 2019-11-12 03:49:07 +02:00
Aliaksandr Valialkin
d420871d79 deployment/docker: upgrade docker image tag from v1.28.3-cluster to v1.29.0-cluster 2019-11-12 03:44:45 +02:00
Aliaksandr Valialkin
584d8362c8 deployment: update Prometheus from v2.13.0 to v2.14.0 2019-11-12 03:43:59 +02:00
Roman Khavronenko
828f0a2a4b prepare dashboard for external sharing (#231) 2019-11-12 00:23:24 +02:00
Oleg Kovalov
74ba42d111 fix misspelled words (#229) 2019-11-12 00:18:24 +02:00
Aliaksandr Valialkin
c48e39eea9 lib/storage: add tests for dateMetricIDCache 2019-11-11 13:21:05 +02:00
Aliaksandr Valialkin
bdc9045485 README.md: mention that replication doesn't save from disaster 2019-11-11 00:58:08 +02:00
Aliaksandr Valialkin
01801e9e03 dashboards: there will be no 1.28.4 release. It will be 1.29.0 2019-11-10 22:05:10 +02:00
Aliaksandr Valialkin
6bdde0d6d4 lib/storage: eliminate data race when updating lastSyncTime in dateMetricIDCache.Has 2019-11-10 22:04:23 +02:00
Roman Khavronenko
7247a7862d add description, churn rate panel, storage.ingestion rate panel (#228) 2019-11-10 20:32:10 +02:00
Aliaksandr Valialkin
5f52eb7653 lib/fs: do not postpone directory removal on NFS error
Continue trying to remove NFS directory on temporary errors for up to a minute.

The previous async removal process breaks in the following case during VictoriaMetrics start

- VictoriaMetrics opens index, finds incomplete merge transactions and starts replaying them.
- The transaction instructs removing old directories for parts, which were already merged into bigger part.
- VictoriaMetrics removes these directories, but their removal is delayed due to NFS errors.
- VictoriaMetrics scans partition directory after all the incomplete merge transactions are finished
  and finds directories, which should be removed, but weren't still removed due to NFS errors.
- VictoriaMetrics panics when it finds unexpected empty directory.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/162
2019-11-10 13:27:16 +02:00
Aliaksandr Valialkin
9ea2bd822e lib/storage: implement per-day inverted index 2019-11-10 00:20:32 +02:00
Aliaksandr Valialkin
5d8de72414 app/vmrestore: the upcoming release would be 1.29.0 2019-11-10 00:20:18 +02:00
Aliaksandr Valialkin
dea2f3efed lib/storage: use specialized cache for (date, metricID) entries
This improves ingestion performance.
2019-11-09 23:09:18 +02:00
Aliaksandr Valialkin
9a43902bd8 lib/storage: remove unused code from getMetricIDsForTimeRange: it is expected that time range is always non-zero 2019-11-09 19:03:51 +02:00
Aliaksandr Valialkin
c16e17dede lib/storage: properly set time range when deleting time series 2019-11-09 18:50:02 +02:00
Aliaksandr Valialkin
8126007c15 lib/storage: obtain all the time series ids from (tag->metricIDs) rows instead of (metricID->TSID) rows, since this is much faster 2019-11-09 18:04:26 +02:00
Aliaksandr Valialkin
50773348d3 lib/storage: small code prettifying 2019-11-09 14:01:24 +02:00
Aliaksandr Valialkin
44fa8226df lib/uint64set: remove superfluous check for item existence before deleting it in Set.Subtract 2019-11-09 14:01:24 +02:00
Aliaksandr Valialkin
0bc54c23ce lib/storage: inmemoryInvertedIndex prettifying 2019-11-09 14:01:24 +02:00
Aliaksandr Valialkin
46e67bb78c lib/storage: export vm_new_timeseries_created_total metric for determining time series churn rate 2019-11-08 19:58:21 +02:00
Aliaksandr Valialkin
0063c857f5 lib/storage: add inmemory inverted index for the last hour
It should improve performance for `last N hours` dashboards with update intervals smaller than 1 hour.
2019-11-08 19:37:46 +02:00
Aliaksandr Valialkin
33abbec6b4 app/vmselect/promql: adjust memory limits calculations for incremental aggregate functions
Incremental aggregate functions don't keep all the selected time series in memory -
they keep only up to GOMAXPROCS time series for incremental aggregations.

Take into account that the number of time series in RAM can be higher if they are split
into many groups with `by (...)` or `without (...)` modifiers.

This should reduce the number of `not enough memory for processing ... data points` false
positive errors.
2019-11-08 19:37:43 +02:00
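To make the memory bound above concrete, here is a minimal, hypothetical Go sketch of incremental aggregation (illustrative only, not the actual vmselect code): each worker keeps one running accumulator per output group instead of keeping every selected series in RAM, so memory scales with GOMAXPROCS multiplied by the number of groups produced by `by (...)` / `without (...)`.

    package sketch

    import (
        "runtime"
        "sync"
    )

    // series is a single data point of one selected time series; group holds the
    // labels left after applying `by (...)` or `without (...)`.
    type series struct {
        group string
        value float64
    }

    // incrementalSum aggregates sum() incrementally: only one accumulator per
    // (worker, group) pair is kept in memory while the input streams through.
    // The caller must close input once all points have been sent.
    func incrementalSum(input <-chan series) map[string]float64 {
        workers := runtime.GOMAXPROCS(0)
        partials := make([]map[string]float64, workers)
        var wg sync.WaitGroup
        for i := 0; i < workers; i++ {
            partials[i] = map[string]float64{}
            wg.Add(1)
            go func(p map[string]float64) {
                defer wg.Done()
                for s := range input {
                    p[s.group] += s.value
                }
            }(partials[i])
        }
        wg.Wait()
        // Merge the per-worker partial sums into the final per-group result.
        result := map[string]float64{}
        for _, p := range partials {
            for g, v := range p {
                result[g] += v
            }
        }
        return result
    }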
Aliaksandr Valialkin
7d7fbf890e app/{vmbackup,vmrestore}: add vmbackup and vmrestore tools for creating backups on s3 or gcs from instant snapshots
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/203
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/38
2019-11-07 21:26:43 +02:00
Roman Khavronenko
4e7a2a41a4 Cluster dashboard (#222)
* add dashboard for cluster version

* fix queries and panels

* review fixes

* use resident memory for memory usage panel

* fix job selectors
2019-11-07 12:09:27 +02:00
Aliaksandr Valialkin
89c03a5464 lib/storage: populate partition names from both small and big directories
Certain partition directories may be missing after restoring from backups
if they had no data. Re-create such directories on start.
2019-11-06 19:50:21 +02:00
Aliaksandr Valialkin
1c777e0245 lib/storage: substitute error message about unsorted items in the index block after metricIDs merge with counter
The origin of the error has been detected and documented in the code,
so it is enough to export a counter for such errors at `vm_index_blocks_with_metric_ids_incorrect_order_total`,
so it can be monitored and alerted on at high error rates.

Also export the counter for processed index blocks with metricIDs - `vm_index_blocks_with_metric_ids_processed_total`,
so its rate can be compared to `rate(vm_index_blocks_with_metric_ids_incorrect_order_total)` (see the example expression below).
2019-11-06 14:32:41 +02:00
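For example (an illustrative expression, not taken from the repo), the share of incorrectly ordered blocks could be monitored with:

    rate(vm_index_blocks_with_metric_ids_incorrect_order_total[5m])
      / rate(vm_index_blocks_with_metric_ids_processed_total[5m])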
Aliaksandr Valialkin
c567a4353a lib/storage: take into account the requested time range when caching TSIDs for the given tag filters 2019-11-06 14:32:41 +02:00
Aliaksandr Valialkin
c6564c5d26 lib/storage: dump incorrectly sorted items on a single line; this should simplify error reporting 2019-11-05 18:41:50 +02:00
Aliaksandr Valialkin
2ef5082ead deployment/docker: update docker images from v1.28.2-cluster to v1.28.3-cluster 2019-11-05 18:08:50 +02:00
Aliaksandr Valialkin
a10c4cad85 lib/storage: return back finalPartsToMerge from 2 to 3 in order to prevent from excessive merges in old partitions 2019-11-05 17:28:57 +02:00
Aliaksandr Valialkin
e5b1fa0c38 lib/storage: separate the max inverted index scan loops per metric into fast and slow loops
Slow loops could require seeks and expensive regexp matching, while fast loops just scan
all the metricIDs for the given `tag=value` prefix. So these operations must have separate
max loops multipliers.
2019-11-05 17:28:57 +02:00
Aliaksandr Valialkin
f93c4f2493 lib/storage: skip repeated useless work when intersection of metricIDs with the given filter is too expensive
This should improve performance for query filters over big number of time series.
2019-11-05 14:35:55 +02:00
Aliaksandr Valialkin
f48e97263c lib/storage: reduce the maximum inverted index scans before giving up to label filters matching by metric name
The new value reduces the amount of wasted work during index scans over big number of time series.
2019-11-05 14:35:53 +02:00
Aliaksandr Valialkin
d2f688c550 lib/storage: try potentially faster tag filters at first, then apply slower tag filters
The fastest tag filters are non-negative non-regexp, since they are the most specific.
The slowest tag filters are negative regexp, since they require scanning
all the entries for the given label.
2019-11-05 14:35:48 +02:00
Aliaksandr Valialkin
a72b22a8b1 Makefile: add pprof-cpu rule for inspecting CPU profiles with PPROF_FILE=/path/to/cpu.pprof make pprof-cpu 2019-11-04 12:43:57 +02:00
Aliaksandr Valialkin
2a38d30f93 lib/storage: pass pointer to MetricName in Fatalf, so it is properly detected as an interface with String() method
This fixes lint errors
2019-11-04 01:06:45 +02:00
Artem Navoiev
e05500cbd4 add unittests for bytesutil and storage (#221) 2019-11-04 00:57:24 +02:00
Aliaksandr Valialkin
f5fbc3ffd7 lib/{storage,uint64set}: add Set.Union() function and use it 2019-11-04 00:48:32 +02:00
Aliaksandr Valialkin
23e078261e lib/storage: tune the returned value from adjustMaxMetricsAdaptive 2019-11-04 00:45:28 +02:00
Aliaksandr Valialkin
386c349c8c lib/storage: remove interface conversion in hot path during block merging
This should improve merge speed a bit for parts with big number of small blocks.
2019-11-03 12:33:48 +02:00
Aliaksandr Valialkin
26ffc77622 lib/{storage,mergeset}: create missing partition directories after restoring from backups
Backup tools could skip empty directories. So re-create such directories on the first run.
2019-11-02 02:27:19 +02:00
Aliaksandr Valialkin
5d439cc6f2 lib/{decimal,encoding}: optimize float64<->decimal conversion for arrays with zeros or ones
Time series with only zeros or ones frequently occur in monitoring, so it is worth optimizing their handling.
2019-11-01 16:46:08 +02:00
Aliaksandr Valialkin
1037053fed lib/{encoding,decimal}: add benchmarks for blocks containing zeros or ones
Time series with such values are quite common in monitoring space,
so it would be great to have benchmarks for them.
2019-11-01 16:46:08 +02:00
Aliaksandr Valialkin
46b8e13d8c deployment/docker: update image tag from v1.28.1-cluster to v1.28.2-cluster 2019-11-01 16:33:46 +02:00
Aliaksandr Valialkin
44fab198e2 lib/uint64set: return an empty set instead of a nil set from Set.Clone, since the caller may add data to the cloned set
This fixes the following panic in v1.28.1:

panic: runtime error: invalid memory address or nil pointer dereference
[signal SIGSEGV: segmentation violation code=0x1 addr=0x10 pc=0x783a7e]

goroutine 1155 [running]:
github.com/VictoriaMetrics/VictoriaMetrics/lib/uint64set.(*Set).Add(0x0, 0x15b3bfb41e8b71ec)
  github.com/VictoriaMetrics/VictoriaMetrics@/lib/uint64set/uint64set.go:57 +0x2e
github.com/VictoriaMetrics/VictoriaMetrics/lib/storage.(*indexSearch).getMetricIDsForRecentHours(0xc5bdc0dd40, 0x16e273f6b50, 0x16e2745d3f0, 0x5b8d95, 0x10, 0x4a2f51, 0xaa01000000000000)
  github.com/VictoriaMetrics/VictoriaMetrics@/lib/storage/index_db.go:1951 +0x260
github.com/VictoriaMetrics/VictoriaMetrics/lib/storage.(*indexSearch).getMetricIDsForTimeRange(0xc5bdc0dd40, 0x16e273f6b50, 0x16e2745d3f0, 0x5b8d95, 0x10, 0xb296c0, 0xc00009cd80, 0x9bc640)
2019-11-01 16:12:21 +02:00
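The panic above comes from calling Add on a nil *Set returned by Clone. The following Go sketch only illustrates the fix pattern with a toy stand-in type, not the actual lib/uint64set implementation:

    package sketch

    // Set is a toy stand-in for lib/uint64set.Set, used only to show why Clone
    // must return a usable empty set instead of nil.
    type Set struct {
        items map[uint64]struct{}
    }

    // Clone always returns a non-nil set, so the caller may call Add on the result
    // right away. Returning nil here is what caused the nil pointer dereference.
    func (s *Set) Clone() *Set {
        clone := &Set{items: map[uint64]struct{}{}}
        if s == nil {
            return clone
        }
        for x := range s.items {
            clone.items[x] = struct{}{}
        }
        return clone
    }

    // Add panics when invoked on a nil receiver - exactly the crash shown in the
    // goroutine trace above.
    func (s *Set) Add(x uint64) {
        s.items[x] = struct{}{}
    }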
Aliaksandr Valialkin
4a8251feff app/vmselect/promql: add lag(q[d]) function, which returns the lag between the current timestamp and the timestamp for the last data point in q 2019-11-01 12:21:43 +02:00
Aliaksandr Valialkin
bd065aad5e deployment/docker: update docker images from v1.28.0-cluster to v1.28.1-cluster 2019-10-31 17:03:27 +02:00
Aliaksandr Valialkin
6ab9c98a1e app/vmstorage: add -bigMergeConcurrency and -smallMergeConcurrency flags for tuning the maximum number of CPU cores used during merges 2019-10-31 16:17:29 +02:00
Aliaksandr Valialkin
6a22727676 lib/storage: optimize getMetricIDsForRecentHours for per-tenant lookups 2019-10-31 15:51:09 +02:00
Aliaksandr Valialkin
ca480915ca lib/storage: small cleanup in Storage.add 2019-10-31 14:30:22 +02:00
Aliaksandr Valialkin
22030b558f lib/decimal: speed up FromFloat for common case with integers 2019-10-31 13:25:09 +02:00
Aliaksandr Valialkin
6510258a80 lib/decimal: increase float64->decimal conversion precision a bit
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/213
2019-10-30 02:04:28 +02:00
Aliaksandr Valialkin
a27e034a40 lib/storage: get parts to merge after applying the limit on the number of concurrent merges
This should reduce write amplification under high ingestion rate.
2019-10-30 00:09:44 +02:00
Aliaksandr Valialkin
5d2276dbf7 lib/{mergeset,storage}: limit the maximum number of concurrent merges; leave smaller number of parts during final merge 2019-10-29 12:45:37 +02:00
Aliaksandr Valialkin
78166cc478 vendor: update github.com/VictoriaMetrics/fastcache from v1.5.1 to v1.5.2 2019-10-29 11:31:36 +02:00
Aliaksandr Valialkin
f581b2736a lib/fs: typo fix in comment to WriteFileAtomically 2019-10-29 11:31:34 +02:00
Aliaksandr Valialkin
a638c6d4f8 vendor: make vendor-update 2019-10-28 13:41:13 +02:00
Roman Khavronenko
1750ee1575 * #215: update klauspost/compress lib
* #215: bump klauspost/compress lib to 1.9.1
2019-10-28 13:41:10 +02:00
Aliaksandr Valialkin
eb513e7ba3 lib/decimal: increase float->decimal conversion precision for big numbers
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/213
2019-10-28 13:23:54 +02:00
Aliaksandr Valialkin
4e6bf6f538 app/vmselect: add -search.latencyOffset flag for tuning the time after data collection when data points become visible in query results
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/218
2019-10-28 12:32:36 +02:00
Aliaksandr Valialkin
121be98325 deployment/docker: upgrade Go builder from go1.13.1 to go1.13.3 2019-10-20 23:49:18 +03:00
hanzai
52778da1f3 warns during rows addition (#214) 2019-10-20 23:38:51 +03:00
Aliaksandr Valialkin
6823aaaf08 README.md: add capacity planning chapter 2019-10-19 10:48:00 +03:00
Aliaksandr Valialkin
78fc35c9b1 all: make fmt 2019-10-17 20:05:12 +03:00
Aliaksandr Valialkin
88d793305d Makefile: disable structcheck in golangci-lint, since it gives false positive on embedded structs 2019-10-17 20:00:17 +03:00
Aliaksandr Valialkin
5b01b7fb01 all: add support for GOARCH=386 and fix all the issues related to 32-bit architectures such as GOARCH=arm
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/212
2019-10-17 18:27:49 +03:00
Aliaksandr Valialkin
5d2af2cfa2 vendor: update github.com/valyala/quicktemplate from v1.2.0 to v1.3.1 2019-10-17 18:27:49 +03:00
Aliaksandr Valialkin
12c8afc3f2 lib/memory: properly handle int overflow in sysTotalMemory
This should fix builds on 32-bit architectures such as arm.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/212
2019-10-17 18:27:49 +03:00
Aliaksandr Valialkin
7d7d7a7d4e deployment/docker/docker-compose.yml: update Prometheus from v2.12.0 to v2.13.0 2019-10-16 12:39:17 +03:00
Aliaksandr Valialkin
e0109fc316 deployment/docker/docker-compose.yml: update VictoriaMetrics image from v1.27.0-cluster to v1.28.0-cluster 2019-10-16 12:39:17 +03:00
Aliaksandr Valialkin
469d169a5d README.md: mention our Slack 2019-10-16 12:31:53 +03:00
Aliaksandr Valialkin
99786c2864 app/vmselect/prometheus: add -search.maxLookback command-line flag for overriding dynamic calculations for max lookback interval
If set, this flag works similarly to `-search.lookback-delta`. If the flag isn't set, the max lookback interval
is determined dynamically from the interval between data points for each input time series.

The interval can be overridden on a per-query basis by passing the `max_lookback=<duration>` query arg to `/api/v1/query` and `/api/v1/query_range` (see the example below).

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/209
2019-10-15 21:37:17 +03:00
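An illustrative request using the per-query override mentioned above (the query and values are made up):

    /api/v1/query?query=up&max_lookback=5m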
Aliaksandr Valialkin
ce266d157d README.md: mention contact email for consulting and support 2019-10-15 00:11:39 +03:00
Aliaksandr Valialkin
dc2f822577 lib/prompb: removed outdated README.md 2019-10-14 22:16:36 +03:00
Aliaksandr Valialkin
8ecdb04b7c Makefile: remove obsolete Makefile include from /helm/ directory 2019-10-13 23:22:54 +03:00
Aliaksandr Valialkin
92e0ca6bbf vendor: make vendor-update 2019-10-13 23:18:28 +03:00
Artem Navoiev
75504747c8 Update README.md change helm section 2019-10-13 23:05:37 +03:00
Artem Navoiev
3d3d87f718 [deployment] remove helm chart 2019-10-13 23:03:52 +03:00
Aliaksandr Valialkin
bf6fe234b2 README.md: mention the delete_series handler
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/207
2019-10-10 02:09:38 +03:00
Aliaksandr Valialkin
f1a7965676 README.md: refer to comment about ingestion rate scalability
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/175
2019-10-09 17:28:25 +03:00
Aliaksandr Valialkin
7b6570489a README.md: add a few words about scalability 2019-10-09 13:01:59 +03:00
Aliaksandr Valialkin
661b8ede5b lib/storage: harden the check that the original items are sorted after mergeTagToMetricIDsRows fails to preserve sort order 2019-10-09 12:13:43 +03:00
Aliaksandr Valialkin
7f4a04ee6a lib/storage: typo fix in comment to maxRowsPerSmallPart. 2019-10-08 18:51:56 +03:00
Aliaksandr Valialkin
7e410e1412 lib/storage: add tests for mergeTagToMetricIDsRows and return the original items if the function breaks items' ordering.
This should save from data corruption issues revealed in the previous releases up to v1.28.0-beta5.
2019-10-08 16:35:39 +03:00
Aliaksandr Valialkin
a5302a6651 app/vmselect/promql: take into account the previous point when calculating max_over_time and min_over_time
This lines up with the `first_over_time` function used in `rollup_candlestick`, so `rollup=low` always returns
the minimum value.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/204
2019-10-08 12:30:16 +03:00
Aliaksandr Valialkin
95d0f1bfd1 vendor: make vendor-update 2019-10-06 15:48:45 +03:00
Aliaksandr Valialkin
84b3b29644 README.md: add binaries section with urls to pre-compiled binaries and docker images 2019-10-06 11:42:36 +03:00
Aliaksandr Valialkin
39b18b1dcd vendor: update github.com/VictoriaMetrics/metrics from v1.7.1 to v1.7.2 2019-10-06 11:20:03 +03:00
Stian Ovrevage
ef6e01b1fa Add bool to extraLabels. Fix tls indentation 2019-10-03 21:47:00 +03:00
Stian Ovrevage
4fb63d7d61 Fix helm template indentation 2019-10-03 21:47:00 +03:00
Aliaksandr Valialkin
9fce611fbb lib/mergeset: reduce the maximum number of cached blocks, since there are reports on OOMs due to too big caches
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/189
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/195
2019-09-30 12:27:30 +03:00
Aliaksandr Valialkin
483af3a97a app/vmselect/netstorage: hint the OS that tmpBlocksFile is read almost sequentially
This became the case after b7ee2e7af2 .
2019-09-30 00:13:33 +03:00
Aliaksandr Valialkin
946ca438a6 app/vmselect/netstorage: marshal block outside tmpBlocksFile.WriteBlock
This also allows marshaling outside the lock, thus reducing the amount of work under the lock.
2019-09-28 20:57:20 +03:00
Aliaksandr Valialkin
e92e39eddf app/vmselect/netstorage: reduce the number of disk seeks when the query processes big number of time series 2019-09-28 20:57:20 +03:00
Aliaksandr Valialkin
56dff57f77 app/vmselect/netstorage: reduce memory usage when fetching big number of data blocks from vmstorage
Dump data blocks directly to temporary file instead of buffering them in RAM
2019-09-28 12:21:57 +03:00
Aliaksandr Valialkin
ba460f62e6 app/vmselect/promql: do not generate timestamps for NaN values in timestamp function according to Prometheus logic 2019-09-27 18:55:16 +03:00
Stian Øvrevåge
a9dac3829e Update README.md - Fix helm command typos
`victoria-mertrics` -> `victoria-metrics` in helm commands.
2019-09-27 12:33:48 +03:00
Aliaksandr Valialkin
de919574a5 deployment/docker: switch Go builder image from v1.13.0 to v1.13.1 2019-09-26 17:12:52 +03:00
Aliaksandr Valialkin
d0b4590099 lib/storage: optimize TSID comparison 2019-09-26 14:20:02 +03:00
Aliaksandr Valialkin
95e3d648cb lib/storage: verify whether items are sorted at the end of the call to mergeTagToMetricIDsRows
This should prevent inverted index corruption if a bug in mergeTagToMetricIDsRows is discovered.
2019-09-26 13:13:58 +03:00
Aliaksandr Valialkin
2b8358726f lib/storage: properly match labels against regexp with (?i) flag
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/161
2019-09-26 11:03:26 +03:00
Aliaksandr Valialkin
bd1cf053f6 app/vmselect/promql: add increases_over_time and decreases_over_time functions
`increases_over_time(q[d])` returns the number of `q` increases during the given duration `d`.
`decreases_over_time(q[d])` returns the number of `q` decreases during the given duration `d`.
2019-09-25 20:38:51 +03:00
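A minimal Go sketch of the counting logic behind these two functions (not the actual MetricsQL rollup implementation): given the raw sample values of q that fall into the window d, count how many adjacent pairs go up or down.

    package sketch

    // countIncreasesAndDecreases returns how many times adjacent values increase
    // and how many times they decrease within a single window of raw samples.
    func countIncreasesAndDecreases(values []float64) (increases, decreases int) {
        for i := 1; i < len(values); i++ {
            switch {
            case values[i] > values[i-1]:
                increases++
            case values[i] < values[i-1]:
                decreases++
            }
        }
        return increases, decreases
    }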
Aliaksandr Valialkin
4e3871ac1e lib/storage: add missing break in removeDuplicateMetricIDs 2019-09-25 18:23:13 +03:00
Aliaksandr Valialkin
4468f9f966 lib/storage: remove duplicate MetricIDs in tag->metricIDs items before writing them into inverted index 2019-09-25 17:57:36 +03:00
Aliaksandr Valialkin
adc18c3ee6 lib/{mergeset,storage}: do not cache inverted index blocks containing tag->metricIDs items
This should reduce the amount of RAM used during queries with filters over a big number of time series.
2019-09-25 13:48:24 +03:00
Aliaksandr Valialkin
8d398af92f app/vminsert/netstorage: mention the data size that cannot be sent to vmstorage
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/175
2019-09-25 12:53:41 +03:00
Aliaksandr Valialkin
73ac7b8dd6 app/vminsert/netstorage: make sure the conn exists before closing it in storageNode.closeBrokenConn
The conn can be missing or already closed during the call to storageNode.closeBrokenConn.
Prevent `nil pointer dereference` panic by verifying whether the conn is already closed.

Thanks to @CH-anhngo for reporting the issue.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/189
2019-09-25 10:36:50 +03:00
Aliaksandr Valialkin
c64fb91a43 lib/uint64set: optimize Set.AppendTo 2019-09-25 00:34:31 +03:00
Aliaksandr Valialkin
de0e4eee2c lib/storage: create and use lib/uint64set instead of map[uint64]struct{}
This should improve inverted index search performance for filters matching big number of time series,
since `lib/uint64set.Set` is faster than `map[uint64]struct{}` for both `Add` and `Has` calls.
See the corresponding benchmarks in `lib/uint64set`.
2019-09-24 21:18:04 +03:00
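A rough usage comparison implied by the commit above; the Add and Has method names come from the commit message, everything else here is an assumption made for illustration.

    package sketch

    import "github.com/VictoriaMetrics/VictoriaMetrics/lib/uint64set"

    // countMatchesWithMap is the old-style membership check based on map[uint64]struct{}.
    func countMatchesWithMap(ids map[uint64]struct{}, candidates []uint64) int {
        n := 0
        for _, id := range candidates {
            if _, ok := ids[id]; ok {
                n++
            }
        }
        return n
    }

    // countMatchesWithSet does the same membership check via lib/uint64set,
    // which is faster for both Add and Has according to the commit above.
    func countMatchesWithSet(ids *uint64set.Set, candidates []uint64) int {
        n := 0
        for _, id := range candidates {
            if ids.Has(id) {
                n++
            }
        }
        return n
    }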
Aliaksandr Valialkin
2212d0e421 lib/storage: typo fix: return dstData instead of data from mergeTagToMetricIDsRows 2019-09-24 19:32:58 +03:00
Aliaksandr Valialkin
9307de1b92 lib/storage: limit the number of metricIDs in tag->metricIDs row
This reduces the overhead on index and metaindex in lib/mergeset
2019-09-24 00:50:47 +03:00
Aliaksandr Valialkin
7734fc8012 lib/storage: share tsids across all the partSearch instances
This should reduce memory usage when big number of time series matches the given query.
2019-09-23 22:36:16 +03:00
Aliaksandr Valialkin
67a2bcb98a lib/{storage,mergeset}: verify PrepareBlock callback results
Do not touch the first and the last item passed to PrepareBlock
in order to preserve sort order of mergeset blocks.
2019-09-23 20:46:33 +03:00
Aliaksandr Valialkin
3304dc1e85 lib/mergeset: detect whether we are in test by executable suffix 2019-09-22 23:12:35 +03:00
Aliaksandr Valialkin
d2ed8cb0b2 lib/storage: generate the first tag->metricIDs item in a mergeset block with a single metricID
The first item from each mergeset block goes into index (lib/mergeset.blockHeader),
so it must be short in order to reduce index size.
2019-09-22 19:37:50 +03:00
Aliaksandr Valialkin
0a9cb6368e lib/workingsetcache: remove data race when resetting c.misses 2019-09-22 19:37:09 +03:00
Aliaksandr Valialkin
7d13c31566 lib/{storage,mergeset}: merge tag->metricID rows into tag->metricIDs rows for common tag values
This should improve lookup performance if the same `label=value` pair exists
in big number of time series.
This should also reduce memory usage for mergeset data cache, since `tag->metricIDs` rows
occupy less space than the original `tag->metricID` rows.
2019-09-20 22:06:23 +03:00
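Schematically, the merge turns many per-series rows for a popular `label=value` pair into a single row (the layout below is purely illustrative, not the on-disk format):

    before:  job=node_exporter -> metricID 1
             job=node_exporter -> metricID 2
             job=node_exporter -> metricID 3
    after:   job=node_exporter -> metricIDs [1, 2, 3]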
Aliaksandr Valialkin
272e2f77c9 lib/encoding: optimize UnmarshalUint* and UnmarshalInt* 2019-09-20 13:08:24 +03:00
Aliaksandr Valialkin
7e0c6d4ca6 lib/storage: optimize selecting all the metricIDs by scanning MetricID->TSID entries instead of tag->MetricID entries
The number of MetricID->TSID entries is smaller than the number of tag->MetricID entries
and MetricID->TSID entries are usually shorter than tag->MetricID entries.
This should improve performance when selecting all the metricIDs.
2019-09-20 11:57:57 +03:00
Aliaksandr Valialkin
b0c738ae8b app/vminsert/opentsdbhttp: remove FATAL prefix from logger.Fatalf errors for the sake of consistency with other logger.Fatalf calls 2019-09-19 22:16:11 +03:00
Aliaksandr Valialkin
bf8505353a lib/mergeset: rename misleading mergeSmallParts to mergeExistingParts 2019-09-19 21:48:36 +03:00
Aliaksandr Valialkin
ebbef20535 lib/mergeset: use sort.IsSorted instead of sort.SliceIsSorted in inmemoryBlock.isSorted in order to reduce memory allocations 2019-09-19 20:13:54 +03:00
Aliaksandr Valialkin
89234f395d lib/storage: use sort.Sort instead of sort.Slice in getSortedMetricIDs 2019-09-19 20:08:13 +03:00
Aliaksandr Valialkin
6e586fa09c lib/storage: skip duplicate call to intersectMetricIDsWithTagFilter on zero successful intersects 2019-09-19 17:51:10 +03:00
Aliaksandr Valialkin
410f993bf6 lib/mergeset: fill partHeader.firstItem on first block flush 2019-09-19 17:48:22 +03:00
Aliaksandr Valialkin
c05885fb5f lib/storage: mark tag filter returning errFallbackToMetricNameMatch as useless
This will save CPU on subsequent calls for this filter
2019-09-18 19:11:44 +03:00
Aliaksandr Valialkin
e041a196a7 deployment/docker/docker-compose.yml: update Prometheus from v2.3.2 to v2.12.0 2019-09-18 18:30:02 +03:00
Aliaksandr Valialkin
db71c940ea lib/storage: properly construct keys for uselessTagFiltersCache and register useless negative tag filters there 2019-09-17 23:18:37 +03:00
Artem Navoiev
ccb6dc6925 [deployment] clean up helm (#185) 2019-09-16 21:58:16 +03:00
Aliaksandr Valialkin
491b1317f4 vendor: update github.com/valyala/gozstd from v1.6.1 to v1.6.2 2019-09-16 21:50:02 +03:00
Aliaksandr Valialkin
5666112de2 deployment: switch docker image tag from v1.27.2-cluster to v1.27.3-cluster 2019-09-14 11:33:18 +03:00
Aliaksandr Valialkin
ba21622b78 vendor: make vendor-update 2019-09-13 22:49:34 +03:00
Aliaksandr Valialkin
020341d13a deployment/docker: remove file system paths from the compiled binary 2019-09-13 22:46:07 +03:00
Aliaksandr Valialkin
550a12415a app/vminsert/netstorage: log network errors when sending data to vmstorage nodes
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/175
2019-09-13 22:26:24 +03:00
Aliaksandr Valialkin
41ef6b060e lib/mergeset: properly check for sorted block headers
Fix a typo for https://github.com/VictoriaMetrics/VictoriaMetrics/issues/181
2019-09-13 21:59:38 +03:00
Aliaksandr Valialkin
ee4585db33 app/vmselect/promql: properly handle subqueries like aggr_func(rollup_func(metric[window:step]))
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/184
2019-09-13 21:42:11 +03:00
hanzai
08cde5e3f4 lib/workingsetcache: adjust switching from mode=split to mode=whole smoothly and load cachefile successfully 2019-09-13 19:13:16 +03:00
Aliaksandr Valialkin
828e5f6d26 app/vmselect/promql: binary operation fixes according to Prometheus behaviour
The following issues were fixed:
- VictoriaMetrics could leave superfluous labels when using `on` or `ignoring` modifiers
- VictoriaMetrics could return a `duplicate timeseries` error when using `group_left` or `group_right` with a non-empty label list
2019-09-13 17:43:09 +03:00
Artem Navoiev
62b424bc4c [ci] github actions - run pipeline on pull request. Fix running of test in external PR from forks 2019-09-11 14:54:45 +03:00
Aliaksandr Valialkin
ed50b8792b app/vminsert/netstorage: reduce the maximum buffer size for rerouted rows, so it occupies less RAM 2019-09-11 14:50:30 +03:00
Aliaksandr Valialkin
b101064f8b all: report the number of bytes read on io.ReadFull error
This should simplify error investigation similar to https://github.com/VictoriaMetrics/VictoriaMetrics/issues/175
2019-09-11 14:50:24 +03:00
Aliaksandr Valialkin
2f4c950fe9 app/vminsert/netstorage: send per-storageNode bufs to vmstorage nodes in parallel
This should improve the maximum ingestion throughput.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/175
2019-09-11 14:50:19 +03:00
Aliaksandr Valialkin
694cc59ed1 app/vminsert/netstorage: dynamically adjust timeouts for sending packets from vminsert to vmstorage depending on packet size
Bigger packets will have a better chance of being sent to vmstorage.
2019-09-11 14:50:14 +03:00
Aliaksandr Valialkin
568ff61dcf lib/mergeset: dynamically calculate the maximum number of items per part, which can be cached in OS page cache 2019-09-09 11:42:45 +03:00
Artem Navoiev
dc6e4151b0 [ci] bump version of go to 1.13 in github actions config 2019-09-08 19:52:05 +03:00
Aliaksandr Valialkin
9b8af27786 vendor: update github.com/klauspost/compress from v1.7.6 to v1.8.2 2019-09-06 00:49:57 +03:00
Aliaksandr Valialkin
b71d828e84 vendor: update golang.org/x/sys 2019-09-06 00:49:57 +03:00
Aliaksandr Valialkin
1f4e0b722d deployment: switch docker image tag from v1.27.1-cluster to v1.27.2-cluster 2019-09-05 12:30:03 +03:00
Aliaksandr Valialkin
2c654258ef lib/fs: add MustStopDirRemover for waiting until pending directories are removed on graceful shutdown
This patch is mainly required for laggy NFS. See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/162
2019-09-05 11:17:17 +03:00
Aliaksandr Valialkin
d0953e9f02 app/vmselect/promql: ignore grouping by destination label in count_values, since such a grouping is performed automatically 2019-09-04 19:59:02 +03:00
Aliaksandr Valialkin
2c2bd897dd lib/storage: remove duplicate tag keys on MetricName.Marshal call
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/172
2019-09-04 18:13:51 +03:00
Aliaksandr Valialkin
5a9b1d85bb deployment/docker: switch Go builder from Go 1.12.9 to Go 1.13.0 2019-09-04 17:17:52 +03:00
Aliaksandr Valialkin
f78ffe565f app/vmselect/promql: do not return artificial points beyond the last point in time series 2019-09-04 16:34:29 +03:00
Aliaksandr Valialkin
a7d5d611fe app/vmselect/prometheus: do not adjust start and end args in /api/v1/query_range if nocache=1 arg is set
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/171
2019-09-04 13:10:17 +03:00
Aliaksandr Valialkin
82bfe818d0 lib/fs: try harder with directory removal on NFS in the event of temporary lock
Do not give up after 11 attempts of directory removal on laggy NFS.

Add `vm_nfs_dir_remove_failed_attempts_total` metric for counting the number of failed attempts
on directory removal.

Log failed attempts on directory removal after long sleep times.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/162
2019-09-04 12:24:41 +03:00
Alfred Krohmer
7cde25bac4 Don't render extraLabels in ServiceMonitor if they are not given
This produced invalid YAML before.
2019-09-03 22:11:24 +03:00
Aliaksandr Valialkin
3182e2a66b deployment: update docker images from v1.27.0-cluster to v1.27.1-cluster 2019-09-03 21:05:50 +03:00
Aliaksandr Valialkin
b08f085082 app/vmselect/promql: reset timeseries name on group_left and group_right as Prometheus does 2019-09-03 20:43:29 +03:00
Aliaksandr Valialkin
458d412bb6 app/vmselect/netstorage: adaptively adjust the maximum inmemory file size for storing temporary blocks
The maximum inmemory file size now depends on `-memory.allowedPercent`.
This should improve performance and reduce the number of filesystem calls
on machines with big amounts of RAM when performing heavy queries
over big number of samples and time series.
2019-09-03 13:32:18 +03:00
Aliaksandr Valialkin
0b0153ba3d lib/storage: invalidate tagFilters -> TSIDS cache when newly added index data becomes visible to search
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/163
2019-08-29 15:08:44 +03:00
Aliaksandr Valialkin
8504a38214 lib/prompb: apply ba06b47c16
The following commands used:

gofmt -r '(uint64(x)&0x7F)<<shift -> uint64(x&0x7F)<<shift' -w ./lib/prompb/
gofmt -r '(int64(x)&0x7F)<<shift -> int64(x&0x7F)<<shift' -w ./lib/prompb/
2019-08-29 13:35:54 +03:00
Aliaksandr Valialkin
fb719bfb23 deployment: update docker images from v1.26.0-cluster to v1.27.0-cluster 2019-08-29 00:09:51 +03:00
Aliaksandr Valialkin
8f81908b1f .github/workflows: added GitHub actions 2019-08-28 23:11:26 +03:00
Aliaksandr Valialkin
604a4312f9 all: port to FreeBSD on GOARCH=amd64 2019-08-28 01:46:09 +03:00
Aliaksandr Valialkin
5893a9f9a3 app/vmstorage: increase default values for search.maxTagKeys, search.maxTagValues and search.maxUniqueTimeseries 2019-08-27 14:28:26 +03:00
Aliaksandr Valialkin
da07a6fb38 lib/storage: go fmt 2019-08-27 14:28:24 +03:00
Aliaksandr Valialkin
a63b69e9e2 lib/storage: report proper maxMetrics limit when more than -search.maxUniqueTimeseries series match the given filters 2019-08-27 14:21:31 +03:00
Aliaksandr Valialkin
82e813bad3 lib/storage: properly handle (?i) in the tag filter regexp
Fixes https://github.com/VictoriaMetrics/VictoriaMetrics/issues/161
2019-08-26 00:44:56 +03:00
Aliaksandr Valialkin
e2eac858b5 lib/storage: calculate the maximum number of rows per small part from -memory.allowedPercent
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/159

This simplifies error detection in addition to the `vm_rows_ignored_total` counters.
2019-08-25 15:29:09 +03:00
Aliaksandr Valialkin
0a8dd9cc9a lib/storage: calculate the maximum number of rows per small part from -memory.allowedPercent
This should improve query speed over recent data on machines with big amounts of RAM
2019-08-25 14:41:32 +03:00
Aliaksandr Valialkin
bc576fb386 lib/storage: properly limit the number of output rows in small and big parts storage
Previously small parts storage didn't take into account the available disk space for big parts.
2019-08-25 14:41:32 +03:00
Aliaksandr Valialkin
947decb3dd lib/storage: remove outdated comment on maxRowsPerSmallPart
The comment became outdated after the commit ed6ac1a5df027f0dfc22448e3b27c26b6f77c67a,
which stops merging of small parts on graceful shutdown instead of waiting
for their completion.
2019-08-25 13:46:10 +03:00
Artem Navoiev
ce7798a6a2 [deployment] add ingresses and service monitors 2019-08-25 01:04:56 +03:00
Aliaksandr Valialkin
38711526d3 app/vminsert/influx: set db label only if the Influx line doesn't have a db tag 2019-08-24 13:55:01 +03:00
Aliaksandr Valialkin
023675c33e vendor: update github.com/valyala/quicktemplate from v1.1.1 to v1.2.0 2019-08-24 13:39:46 +03:00
Aliaksandr Valialkin
1ee536f9fd app/vminsert: skip empty tags 2019-08-24 13:36:41 +03:00
Aliaksandr Valialkin
a283023d16 app/vminsert/opentsdbhttp: skip invalid rows and continue parsing the remaining rows
Invalid rows are logged and counted in `vm_rows_invalid_total{type="opentsdb-http"}` metric
2019-08-24 13:36:41 +03:00
Aliaksandr Valialkin
38b9615c53 app/vminsert/opentsdb: skip invalid rows and continue parsing the remaining rows
Invalid rows are logged and counted in `vm_rows_invalid_total{type="opentsdb"}` metric
2019-08-24 13:36:41 +03:00
Aliaksandr Valialkin
2a8fc41bab app/vminsert/graphite: skip invalid rows and continue parsing the remaining rows
Invalid rows are logged and counted in `vm_rows_invalid_total{type="graphite"}` metric
2019-08-24 13:36:41 +03:00
Aliaksandr Valialkin
22685ef94d app/vminsert/influx: skip invalid rows and continue parsing the remaining rows
Invalid influx lines are logged and counted in `vm_rows_invalid_total{type="influx"}` metric.
2019-08-24 13:36:41 +03:00
Aliaksandr Valialkin
425a81a6c7 app/vminsert/influx: do not allow escaping the newline char, since it doesn't occur in real life
The previous report about escaped newline chars in the Influx line protocol was a false alarm.
2019-08-23 18:43:00 +03:00
Aliaksandr Valialkin
8da8dd0876 app/vminsert/opentsdbhttp: allow timestamp as float64 and as string, since it occurs in real life 2019-08-23 18:35:52 +03:00
Aliaksandr Valialkin
0ea21eb9dc app/vminsert/influx: handle \r\n aka crlf influx line endings from windows world
Such lines exist in real life.
2019-08-23 18:28:54 +03:00
Aliaksandr Valialkin
b3502b2b39 app/vminsert/influx: allow escaping newline char
Though newline char isn't mentioned in escape rules at https://docs.influxdata.com/influxdb/v1.7/write_protocols/line_protocol_tutorial/ ,
there are reports that such chars occur in real life
2019-08-23 15:14:58 +03:00
Aliaksandr Valialkin
f1f8fce4f7 app/vminsert/influx: skip comments starting with # in influx line protocol 2019-08-23 14:43:24 +03:00
Aliaksandr Valialkin
697de90893 app/vminsert: do not drop data in reroutedBuf if all the storage nodes are unhealthy 2019-08-23 10:38:19 +03:00
Aliaksandr Valialkin
a5dc54efc3 app/vminsert: properly limit the size of reroutedBuf 2019-08-23 10:29:51 +03:00
Aliaksandr Valialkin
c50975e12d vendor: make vendor-update 2019-08-23 10:03:42 +03:00
Aliaksandr Valialkin
c197641978 all: return 503 http error if service is temporarily unavailable
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/156
2019-08-23 09:49:50 +03:00
Aliaksandr Valialkin
e734076f0f app/vminsert: allow setting the maximum number of labels per time series via -maxLabelsPerTimeseries 2019-08-23 08:47:18 +03:00
Aliaksandr Valialkin
4ed63d033a lib/storage: add benchmarks for regexp filter match / mismatch
These benchmarks allow estimating the performance of regexp filters in PromQL
2019-08-22 16:37:19 +03:00
Aliaksandr Valialkin
559dd03181 deployment: update docker image tags from v1.25.2-cluster to v1.26.0-cluster 2019-08-22 14:58:58 +03:00
Aliaksandr Valialkin
e9db22a551 app/vmselect/promql: attempt to repair invalid bucket counts passed to histogram_quantile
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/136
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/154
2019-08-22 14:39:24 +03:00
Aliaksandr Valialkin
0697164b4f app/vminsert: add ability to ingest data via HTTP OpenTSDB /api/put requests
This is a manual merge of https://github.com/VictoriaMetrics/VictoriaMetrics/pull/152
Thanks to nustinov@gmail.com for the initial pull request.
2019-08-22 12:46:54 +03:00
Aliaksandr Valialkin
4d555c7c87 app/vminsert/opentsdb: fix BenchmarkRowsUnmarshal by adding missing put prefixes to each line 2019-08-21 19:15:04 +03:00
Aliaksandr Valialkin
90a4b00b10 app/vmselect/promql: fix panic on -search.disableCache
Reset the cache instead of stopping it when the cache is disabled, since the cache is also stopped on graceful shutdown.
2019-08-21 17:12:01 +03:00
Aliaksandr Valialkin
491b1762c8 app/vmselect/promql: explain why empty timeseries aren't removed in transformLabelValue 2019-08-21 11:29:41 +03:00
Aliaksandr Valialkin
db1de4277c app/vmselect/promql: remove NaNs from /api/v1/query_range output like Prometheus does
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/153
2019-08-20 23:01:59 +03:00
Aliaksandr Valialkin
99331606e1 app/vmselect/promql: pre-allocate memory for map for checking for duplicate timeseries
This should reduce memory allocations for big number of timeseries
2019-08-20 23:01:57 +03:00
Aliaksandr Valialkin
1101765adb app/vmselect/promql: add label_value(q, label_name) func, which returns numeric values of the label with name label_name in q 2019-08-20 00:28:44 +03:00
Aliaksandr Valialkin
6ec6a8d7c1 lib/storage: try a slower path for searching the tag filter with the minimum number of matching time series before giving up with the `increase -search.maxUniqueTimeseries` error 2019-08-19 16:07:05 +03:00
Aliaksandr Valialkin
940349ccb9 app/vmselect/promql: independently track offset hints for tStart and tEnd
This should improve performance if a time series starts or ends within the selected time range
2019-08-19 13:40:24 +03:00
Aliaksandr Valialkin
6ae4b4190f app/vmselect/promql: optimize search for timestamp boundaries in rollupConfig.Do
This should improve the performance of queries over big number of time series
with big number of output points.
2019-08-19 13:03:38 +03:00
Aliaksandr Valialkin
c59f5c4865 lib/storage: pre-allocate memory for blockHeader slice in unmarshalBlockHeaders
This reduces memory usage and memory fragmentation when working with big number of time series
2019-08-19 12:46:45 +03:00
Aliaksandr Valialkin
45e57be590 deployment: update docker image tags from v1.25.1-cluster to v1.25.2-cluster 2019-08-18 22:56:11 +03:00
Aliaksandr Valialkin
0f45273e20 deployment/docker: switch Go builder from go1.12.8 to go1.12.9 2019-08-18 22:09:21 +03:00
Aliaksandr Valialkin
005aabd305 app/vmselect/promql: add scrape_interval(q[d]) function, which would return scrape interval for q over d 2019-08-18 21:08:15 +03:00
Aliaksandr Valialkin
218cb4623a app/vmselect/promql: handle comparisons with NaN similarly to Prometheus
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/150
2019-08-18 00:25:58 +03:00
Aliaksandr Valialkin
dcce92c63c app/vmselect/promql: add lifetime(q[d]) function, which returns the lifetime of q over d in seconds.
This function is useful for determining time series lifetime.
`d` must exceed the expected lifetime of the time series, otherwise
the function would return values close to `d`.
2019-08-16 11:59:51 +03:00
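A rough sketch of what a lifetime(q[d])-style rollup computes for a single series (assuming millisecond timestamps; not the actual implementation): the span between the first and the last raw sample inside the lookbehind window d. This is also why `d` must exceed the expected lifetime - a window of length d can never contain a span much larger than d, so the result saturates near d.

    package sketch

    // lifetimeSeconds returns the time span in seconds covered by the raw samples
    // that fall into one lookbehind window. Timestamps are in milliseconds.
    func lifetimeSeconds(timestamps []int64) float64 {
        if len(timestamps) < 2 {
            return 0
        }
        return float64(timestamps[len(timestamps)-1]-timestamps[0]) / 1000
    }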
Aliaksandr Valialkin
0cb66a8f95 app/vmselect/promql: fix corner-case calculation for ideriv 2019-08-16 11:59:50 +03:00
Aliaksandr Valialkin
1b5b9ced27 app/vmselect/promql: properly handle corner cases for rollup functions 2019-08-15 23:31:28 +03:00
Aliaksandr Valialkin
f696cc503a lib/workingsetcache: automatically detect when it is better to double cache capacity 2019-08-15 22:58:04 +03:00
Aliaksandr Valialkin
97634d7101 deployment/docker: switch Go builder from go1.12.7 to go1.12.8 2019-08-15 20:43:23 +03:00
Aliaksandr Valialkin
e6541a7676 deployment: update docker images 2019-08-15 14:18:43 +03:00
Aliaksandr Valialkin
e399b948de Makefile: remove duplicate -cluster suffix from tar.gz file generated by make release, since this suffix must be already present in PKG_TAG 2019-08-15 14:07:43 +03:00
Aliaksandr Valialkin
1dd736a75c Makefile: add make release rule for building release tar.gz file with cluster binaries
Fixes https://github.com/VictoriaMetrics/VictoriaMetrics/issues/146
2019-08-15 14:05:56 +03:00
Aliaksandr Valialkin
c15dfc6cea vendor: update github.com/valyala/gozstd from v1.5.1 to v1.6.0 2019-08-15 12:57:59 +03:00
Aliaksandr Valialkin
83ed5d3109 deployment: update docker images 2019-08-14 03:12:49 +03:00
Aliaksandr Valialkin
99eed2ca14 lib/storage: properly cache tagFilters -> TSIDs entries from historical index 2019-08-14 02:32:25 +03:00
Aliaksandr Valialkin
f1d81b9405 lib/storage: compress contents of cache for tagFilters -> TSIDs
This should increase cache capacity
2019-08-14 02:32:22 +03:00
Aliaksandr Valialkin
b8bbe92de1 app/vmselect/promql: store compressed results in the cache
This should increase rollup results cache capacity.
2019-08-14 02:32:16 +03:00
Aliaksandr Valialkin
8c2158af24 all: use workingsetcache instead of fastcache
This should reduce the amount of RAM required for processing time series
with non-zero churn rate.

The previous cache behavior can be restored with `-cache.oldBehavior` command-line flag.
2019-08-13 21:40:28 +03:00
Aliaksandr Valialkin
51263b1a45 lib/fs: add test for IsTemporaryFileName 2019-08-13 21:33:54 +03:00
Aliaksandr Valialkin
867612a4a4 Makefile: consistency renaming: check_all -> check-all 2019-08-13 21:32:08 +03:00
Aliaksandr Valialkin
5a7ab0d90b lib/storage: remove broken BenchmarkIndexDBSearchTSIDs 2019-08-13 20:21:23 +03:00
Aliaksandr Valialkin
39f3f3a517 lib: move common code for creating flock.lock file into fs.CreateFlockFile 2019-08-13 01:46:20 +03:00
Aliaksandr Valialkin
73f866d874 lib/fs: atomically create file with the given contents on WriteFileAtomically
This should prevent from `transaction` and `metadata.json` files corruption
on unclean shutdown such as OOM, `kill -9`, power loss, etc.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/148
2019-08-12 15:02:04 +03:00
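The usual pattern behind such a helper looks roughly like the sketch below; this is only an illustration of the write-then-rename idea, not the actual lib/fs code: write the data to a temporary file, fsync it, then rename it over the final path, so a crash leaves either the old complete file or the new complete file, never a torn one.

    package sketch

    import "os"

    // writeFileAtomically writes data into path without ever exposing a partially
    // written file: the content goes to a temporary file first, is synced to disk,
    // and is then renamed over the destination (rename is atomic on POSIX filesystems).
    // A production implementation would also fsync the parent directory.
    func writeFileAtomically(path string, data []byte) error {
        tmpPath := path + ".tmp"
        f, err := os.Create(tmpPath)
        if err != nil {
            return err
        }
        if _, err := f.Write(data); err != nil {
            f.Close()
            return err
        }
        if err := f.Sync(); err != nil {
            f.Close()
            return err
        }
        if err := f.Close(); err != nil {
            return err
        }
        return os.Rename(tmpPath, path)
    }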
Aliaksandr Valialkin
ad5be625f8 deployment: update docker images 2019-08-06 16:10:03 +03:00
Aliaksandr Valialkin
4fb635b0c9 lib/storage: do not change timestamps to constant rate if values are constant or have constant delta
This breaks the original timestamps, which results in issues like
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/120 and
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/141 .
2019-08-06 15:40:17 +03:00
Aliaksandr Valialkin
f56c1298ad app/vmstorage: add vm_concurrent_addrows_* metrics for tracking concurrency for Storage.AddRows calls
Also track the number of rows dropped due to exceeding the timeout
on the concurrency limit for Storage.AddRows. This number is tracked in `vm_concurrent_addrows_dropped_rows_total`
2019-08-06 15:08:43 +03:00
Aliaksandr Valialkin
2d869c6d9b vendor: update github.com/VictoriaMetrics/metrics to v1.7.1 2019-08-05 19:21:53 +03:00
Aliaksandr Valialkin
8e05758ff5 app: add vm_concurrent_ metrics for visibility in concurrency limiters for vminsert and vmselect 2019-08-05 18:30:29 +03:00
Aliaksandr Valialkin
1258c9ef10 vendor: make vendor-update 2019-08-05 10:34:38 +03:00
Aliaksandr Valialkin
a3ecf3c1f7 lib/storage: properly reset partSearch.fetchData in partSearch.reset 2019-08-05 09:55:50 +03:00
Artem Navoiev
dd4ea63ed2 [deployment] add statefulset for vmselect (#140) 2019-08-04 23:34:05 +03:00
Aliaksandr Valialkin
a868f8607f deployment: update docker images to v1.24.0-cluster 2019-08-04 23:31:57 +03:00
Aliaksandr Valialkin
53c8f56436 app/vmselect: allow passing match[], start and time to /api/v1/label/<label_name>/values
`/api/v1/label/<label_name>/values?match[]=q` emulates the `label_values(q, <label_name>)`
call in Grafana templating.
2019-08-04 23:07:00 +03:00
Aliaksandr Valialkin
880b1d80b1 app/vmselect: optimize /api/v1/series by skipping storage data
Fetch and process only time series metainfo.
2019-08-04 23:00:46 +03:00
Aliaksandr Valialkin
7f5afae1e3 app/vmselect/prometheus: prevent fetching and scanning all the data on /api/v1/series calls by default 2019-08-04 19:42:45 +03:00
Aliaksandr Valialkin
000c154641 app/vmselect/promql: tune automatic window adjustment
Increase the window adjustment for small scrape intervals,
since they usually have higher jitter.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/139
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/134
2019-08-04 19:34:11 +03:00
Aliaksandr Valialkin
1d4ddadbb1 app/vmselect/promql: further increase the allowed jitter for scrape interval
Real-world production data shows higher jitter than 1/8 of scrape interval.
This may result in gaps on the graph. So increase the allowed jitter to 1/4
of scrape interval in order to reduce the probability of gaps on the graphs
over time series with high jitter for scrape_interval.
2019-08-02 20:16:41 +03:00
Aliaksandr Valialkin
8ed84a4713 app/vminsert/influx: round automatically generated timestamp according to the given precision arg 2019-08-02 00:24:39 +03:00
Aliaksandr Valialkin
ade7bc30db app/vmselect/promql: tolerate higher jitter in scrape interval
Allow jitter for up to 1/8 instead of 1/16 for the scrape interval.
This should improve graphs when `step` is smaller than the `scrape_interval`.
2019-08-01 23:25:53 +03:00
Aliaksandr Valialkin
a99e89945e lib/decimal: modernize tests a bit 2019-07-31 21:09:54 +03:00
Aliaksandr Valialkin
6fceedccce deployment: update docker images 2019-07-31 16:38:39 +03:00
Aliaksandr Valialkin
c994fbf500 app/vmselect/promql: add vm_slow_queries_total metric for counting slow queries
The query is slow if its execution time exceeds `-search.logSlowQueryDuration`
2019-07-31 03:36:45 +03:00
Aliaksandr Valialkin
071a122119 app/vmselect/promql: return NaN from histogram_quantile if at least a single bucket is broken 2019-07-31 01:18:11 +03:00
Aliaksandr Valialkin
b9a16b93e7 app/vmselect/promql: allow adjusting window for default rollup function
The default rollup function is `last_over_time`. It must support adjusting
the provided window in order to prevent gaps on the graph
for window values smaller than the scrape interval.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/134
2019-07-31 00:45:58 +03:00
Aliaksandr Valialkin
c901a6472f app/vmselect/promql: return NaN values if invalid bucket counts are passed to histogram_quantile
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/136
2019-07-30 22:05:55 +03:00
Aliaksandr Valialkin
b7c4b0c6d2 lib/storage: fix matching against tag filter with empty name
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/137
2019-07-30 15:15:21 +03:00
Aliaksandr Valialkin
5b8526e925 app/vmselect/netstorage: improve error message when reading data blocks from storage
Mention the block number in the error. This should simplify troubleshooting in this code.
2019-07-28 12:17:33 +03:00
Aliaksandr Valialkin
b7089705b7 app/vminsert: add vm_rows_per_insert summary metric
This metric should help tuning batch sizes on clients writing data to VictoriaMetrics
2019-07-27 13:28:20 +03:00
Aliaksandr Valialkin
1fd4e9fb5c app/vminsert: improve error messages for Influx, OpenTSDB and Graphite parsing
Include in the error message the line which failed to parse.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/127
2019-07-26 22:09:21 +03:00
Aliaksandr Valialkin
34b21a8671 deployment: update cluster image to v1.23.0-cluster 2019-07-26 20:07:47 +03:00
Aliaksandr Valialkin
8253790157 app/vmstorage: consistency renaming for ignored rows metrics
    vm_too_big_timestamp_rows_total -> vm_rows_ignored_total{reason="big_timestamp"}
    vm_too_small_timestamp_rows_total -> vm_rows_ignored_total{reason="small_timestamp"}
2019-07-26 20:02:24 +03:00
Aliaksandr Valialkin
c6bec48927 lib/storage: add metrics for calculating skipped rows outside the retention
The metrics are:

    - vm_too_big_timestamp_rows_total
    - vm_too_small_timestamp_rows_total
2019-07-26 14:11:56 +03:00
Aliaksandr Valialkin
aac482517f app/vmselect/promql: return NaN from count() over zero time series
This aligns `count` behavior with Prometheus.
2019-07-25 22:02:34 +03:00
Aliaksandr Valialkin
0e52357f35 app/vmselect/promql: properly calculate incremental aggregations grouped by __name__
Previously the following query could fail when multiple distinct metric names matched:

    sum(count_over_time{__name__!=''}) by (__name__)
2019-07-25 21:53:26 +03:00
Aliaksandr Valialkin
f2e8d54fb0 lib/encoding/zstd: go fmt 2019-07-25 01:37:57 +03:00
Aliaksandr Valialkin
97b5dc7122 lib/encoding/zstd: disable CRC checks in pure Go build
This should give slightly better compression and decompression performance.
Additionally this shaves off 4 bytes per each compressed block.
2019-07-24 19:17:32 +03:00
Aliaksandr Valialkin
54f035d4ce all: small updates after PR #114 2019-07-24 17:43:43 +03:00
Aliaksandr Valialkin
7a133567fb lib/encoding: small fixes in tests after the PR #114 2019-07-24 17:43:39 +03:00
Roman Khavronenko
fcf09aaa3c all: add Pure Go build (pull request #114)
Updates #94
2019-07-24 17:43:32 +03:00
Aliaksandr Valialkin
dd7bba94a3 dashboards: use rate instead of irate, because irate doesn't capture spikes
See https://medium.com/@valyala/why-irate-from-prometheus-doesnt-capture-spikes-45f9896d7832 for details
2019-07-20 15:55:48 +03:00
Aliaksandr Valialkin
3fae34eeb4 lib/encoding: improve gauge series detection
- Series with negative values are always gauges
- Counters may only have increasing values with possible counter resets

This should improve the compression ratio for gauge series which
were previously mistakenly detected as counters (a simplified sketch of these rules is shown below).
2019-07-20 14:05:25 +03:00
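A simplified Go sketch of the two rules stated above; the real lib/encoding heuristic is more involved, and the "rare decreases" threshold below is an assumption invented for this illustration only.

    package sketch

    // looksLikeCounter applies the rules from the commit above: any negative value
    // means the series is a gauge; otherwise values should mostly increase, with
    // only occasional drops that could be counter resets. The decrease threshold
    // used here is made up for the sketch and is not the real heuristic.
    func looksLikeCounter(values []float64) bool {
        decreases := 0
        for i, v := range values {
            if v < 0 {
                return false // rule 1: negative values => gauge
            }
            if i > 0 && v < values[i-1] {
                decreases++ // could be a counter reset or a decreasing gauge
            }
        }
        return decreases <= len(values)/8
    }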
Aliaksandr Valialkin
b335a811c3 deployment: switch builder from go1.12.6 to go1.12.7 2019-07-20 12:14:05 +03:00
Jiri Tyr
0aed0e0b5d Adding Grafana dashboards for VM cluster (#105) 2019-07-20 10:25:09 +03:00
Aliaksandr Valialkin
cb8104cf77 app: clarify error messages when -storageNode arg is missing in vminsert and vmselect 2019-07-20 10:21:59 +03:00
Aliaksandr Valialkin
fab1962e02 deployment/k8s/helm: use correct default ports for -storageNode
Previously these ports were swapped. Correct ports are:

- vminsert: -storageNode=*:8400
- vmselect: -storageNode=*:8401
2019-07-20 01:24:32 +03:00
Aliaksandr Valialkin
e3dcfe5851 deployment/docker/docker-compose.yml: use default ports for vminsert and vmselect services
These ports were swapped. Correct default ports are:

- vminsert: -httpListenAddr=:8480, -storageNode=*:8400
- vmselect: -httpListenAddr=:8481, -storageNode=*:8401
2019-07-20 01:20:08 +03:00
Thor Anker Kvisgård Lange
f576b267eb Fixed small bug in vmstorage name template
Signed-off-by: Thor Anker Kvisgård Lange <thanl@mhivestasoffshore.com>
2019-07-17 13:30:23 +03:00
Aliaksandr Valialkin
76b947dcb4 deployment: update Docker images 2019-07-15 23:56:24 +03:00
Aliaksandr Valialkin
7abb96b454 lib/netutil: do not count timeouts as network errors 2019-07-15 23:06:13 +03:00
Aliaksandr Valialkin
2b4254d01f app/vminsert: use netutil.TCPListener for collecting network-related metrics for Graphite and OpenTSDB TCP traffic 2019-07-15 22:58:35 +03:00
Aliaksandr Valialkin
092c9b39a8 app/vmselect/promql: remove empty time series after applying filters like q > 0
This should reduce CPU and RAM usage for queries over high number of time series.
2019-07-12 19:59:49 +03:00
Aliaksandr Valialkin
3bc9d3a14c vendor: update github.com/VictoriaMetrics/metrics to v1.7.0
This version adds support for `process_*` metrics similar
to metrics exposed by https://github.com/prometheus/client_golang .

Fixes https://github.com/VictoriaMetrics/VictoriaMetrics/issues/92
2019-07-12 17:24:58 +03:00
Aliaksandr Valialkin
6875fb411a app/vmselect/promql: parallelize incremental aggregation to multiple CPU cores
This may reduce response times for aggregation over big number of time series
with small step between output data points.
2019-07-12 15:53:12 +03:00
Aliaksandr Valialkin
be0ce54010 deployment: update docker images 2019-07-12 02:35:09 +03:00
Aliaksandr Valialkin
73a47d2a53 lib/storage: remove unused function isTooBigTimeRangeForDateMetricIDs 2019-07-12 02:28:40 +03:00
Aliaksandr Valialkin
97f9397687 lib/storage: do not reduce maxMetrics on time ranges exceeding maxDaysForDateMetricIDs
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/95
2019-07-12 02:21:52 +03:00
Aliaksandr Valialkin
1de6ef5f51 deployment: update Docker images 2019-07-11 19:10:35 +03:00
Aliaksandr Valialkin
4a8e6f47fe app/vmselect/prometheus: set start arg in /api/v1/series to the minimum allowed time by default as Prometheus does
Fixes https://github.com/VictoriaMetrics/VictoriaMetrics/issues/91
2019-07-11 17:11:37 +03:00
Aliaksandr Valialkin
3313cdf816 app/vmselect/prometheus: convert negative times to 0, since they aren't supported by the storage 2019-07-11 17:11:35 +03:00
Aliaksandr Valialkin
4ca66344ee lib/storage: do not pollute inverted index with data for samples outside the retention period 2019-07-11 17:11:33 +03:00
Aliaksandr Valialkin
0522efb2d6 lib/storage: add missing tagFilter.Marshal func 2019-07-11 15:01:01 +03:00
Aliaksandr Valialkin
12b1d67b41 lib/storage: use fast path for orSuffix when searching for metricIDs against plain tag value 2019-07-11 14:48:51 +03:00
Aliaksandr Valialkin
bf2e1b0ac1 lib/storage: remember and skip individual tag filters matching too many metrics
This saves CPU time by skipping useless matching for individual tag filters.
2019-07-11 14:48:47 +03:00
Aliaksandr Valialkin
cbab86fd9d app/vmselect/promql: reduce RAM usage for aggregates over big number of time series
Calculate incremental aggregates for `aggr(metric_selector)` function instead of
keeping all the time series matching the given `metric_selector` in memory.
2019-07-10 13:03:36 +03:00
Aliaksandr Valialkin
ba8195c58e all: consistency renaming: bytesSize -> sizeBytes 2019-07-10 00:47:42 +03:00
Aliaksandr Valialkin
df6f17b82c app/vmselect/promql: mention -search.logSlowQueryDuration flag value in the slow query log message 2019-07-10 00:43:01 +03:00
Aliaksandr Valialkin
73ae889244 app/vmselect/promql: extract rmoeveGroupTags function for removing unneeded tags from MetricName according to the given modifierExpr 2019-07-09 23:20:58 +03:00
Aliaksandr Valialkin
603b34edbd app/vmselect/promql: properly preserve metric name after applying functions in any case from transformFuncsKeepMetricGroup 2019-07-09 23:10:49 +03:00
Aliaksandr Valialkin
d6ec95693d app/vmselect/prometheus: typo fix 2019-07-07 23:34:04 +03:00
Aliaksandr Valialkin
61f6f63964 README.md: mention that vminsert spreads data among vmstorage nodes using consistent hashing 2019-07-07 22:00:45 +03:00
Aliaksandr Valialkin
36636c1f6f app/vmselect/prometheus: handle minTime and maxTime values that may be set by Promxy or Prometheus client
Fixes https://github.com/VictoriaMetrics/VictoriaMetrics/issues/88
2019-07-07 21:53:52 +03:00
Aliaksandr Valialkin
50c5894dc0 deployment: update docker images 2019-07-04 19:54:27 +03:00
Aliaksandr Valialkin
bba07d05fe app/vmselect/promql: remove empty timeseries left after topk call 2019-07-04 19:43:07 +03:00
Aliaksandr Valialkin
41f512af1c all: add vm_data_size_bytes metrics for easy monitoring of on-disk data size and on-disk inverted index size 2019-07-04 19:43:04 +03:00
Aliaksandr Valialkin
512a627855 app/vmselect/prometheus: update adjustLastPoints function
- Do not overwrite the last points with the previous NaNs, since this may result in empty time series.
- Overwrite the last 2 points instead of 3. This should be enough in most cases.
2019-07-04 09:30:56 +03:00
Aliaksandr Valialkin
858746fa6c app/vmselect/promql: gracefully handle duplicate timestamps in irate and rollup_rate funcs
Previously such timestamps resulted in `+Inf` values. Now the previous timestamp is used
for the calculations.
2019-07-03 12:41:30 +03:00
Aliaksandr Valialkin
81da1c7b47 README.md: stylistic updates 2019-07-02 15:57:45 +03:00
Aliaksandr Valialkin
a3abed80ff app/vmselect: do not return empty time series in /api/v1/query result 2019-07-01 17:16:26 +03:00
Aliaksandr Valialkin
6682a35731 lib/memory: attempt #3 to determine memory limit for LXC container
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/84
2019-07-01 14:01:57 +03:00
Aliaksandr Valialkin
c3c60bee45 app/vmselect: add -search.denyPartialResponse flag for disabling partial responses if some of vmstorage nodes are unavailable
Also accept `deny_partial_response` query arg in Prometheus API handlers. If it is set to true,
then return error if some of vmstorage nodes are unavailable.
2019-06-30 01:27:07 +03:00
Aliaksandr Valialkin
60cff62586 deployment: update docker images 2019-06-29 21:25:20 +03:00
Aliaksandr Valialkin
b6ea1a7d5e lib/mergeset: make fmt 2019-06-29 14:25:46 +03:00
Aliaksandr Valialkin
ffc1bb00f6 lib/storage: skip non-matching metricIDs in sortedFilter
This should improve performance for big sortedFilter lists.
2019-06-29 13:49:40 +03:00
Aliaksandr Valialkin
2257dcd278 lib/mergeset: speed up binarySearchKey by skipping the first item during binary search 2019-06-29 13:49:32 +03:00
Aliaksandr Valialkin
72a3050c41 app/vmselect/promql: consistency renaming: candlestick -> rollup_candlestick 2019-06-29 03:13:25 +03:00
Aliaksandr Valialkin
6ea12a079e lib/memory: remove TestReadLXCMemoryLimit, since it doesn't work in Travis 2019-06-28 18:23:06 +03:00
Aliaksandr Valialkin
d0732d3137 lib/memory: attempt #2 to determine memory limit inside LXC container
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/84
2019-06-28 18:08:57 +03:00
Aliaksandr Valialkin
628571a837 README.md: update cluster scheme 2019-06-28 17:54:13 +03:00
Aliaksandr Valialkin
ad436757c3 lib/memory: an attempt to read proper memory limit inside LXC container
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/84
2019-06-28 15:35:02 +03:00
Aliaksandr Valialkin
c6598a8507 vendor: update github.com/VictoriaMetrics/metrics to v1.6.2
This fixes Summary printing for *_count and *_sum values with metric names containing labels.
2019-06-28 14:18:24 +03:00
Aliaksandr Valialkin
4f8cbc0782 vendor: update github.com/VictoriaMetrics/metrics to v1.6.1 2019-06-28 14:06:32 +03:00
Aliaksandr Valialkin
391bc8bf38 app/vmselect: fix 32bit arm build
Fixes https://github.com/VictoriaMetrics/VictoriaMetrics/issues/83
2019-06-27 19:37:17 +03:00
Aliaksandr Valialkin
2d497c3b8e README.md: add cluster scheme image 2019-06-27 19:23:36 +03:00
Aliaksandr Valialkin
96342f1422 app/vmselect: add candlestick(m[d]) func for returning open, close, low and high rollups on the given time range d
This function is frequently used in financial apps. See https://en.wikipedia.org/wiki/Candlestick_chart
2019-06-27 18:46:54 +03:00
Aliaksandr Valialkin
416d27ef11 lib/storage: optimize time series search by regexp filter
This should improve search speed on label filters like `{foo=~"bar.+baz"}`
2019-06-27 16:18:00 +03:00
Aliaksandr Valialkin
5850a9ea78 README.md: improve wording on gossip protocol 2019-06-26 23:50:34 +03:00
Aliaksandr Valialkin
05b7cb1d42 README.md: return back the link to unsuccessful attempt to implement Gossip protocol in Thanos
This link provides good information on gossip fragility
2019-06-26 23:48:48 +03:00
Jiri Tyr
e7a0bf1a71 Change the default influxMeasurementFieldSeparator 2019-06-26 13:22:54 +03:00
Aliaksandr Valialkin
d5cb9fddd8 app/vminsert: fix infinite loop when reading two lines without a trailing newline
Fixes https://github.com/VictoriaMetrics/VictoriaMetrics/issues/82
2019-06-26 02:52:56 +03:00
Aliaksandr Valialkin
916d9ef5b3 deployment: update docker images 2019-06-25 21:49:23 +03:00
Aliaksandr Valialkin
4f54bcf90b app/vmselect/promql: suppress error when template func is used inside modifier list. Just leave it as is
Fixes https://github.com/VictoriaMetrics/VictoriaMetrics/issues/78
2019-06-25 20:43:57 +03:00
Aliaksandr Valialkin
72873f67aa README.md: improve docs for Helm chapter 2019-06-25 20:13:47 +03:00
Aliaksandr Valialkin
ee23a143b9 lib/storage: make sure non-nil args are passed to openIndexDB 2019-06-25 20:10:08 +03:00
Aliaksandr Valialkin
8b0a63722f lib/storage: reduce too big maxMetrics in getTagFilterWithMinMetricIDsCountAdaptive
This should improve performance on inverted index search for big amount of unique time series
when big -search.maxUniqueTimeseries is set.
2019-06-25 19:57:31 +03:00
Aliaksandr Valialkin
0263cb0adc lib/storage: free up memory from caches owned by indexDB when it is deleted 2019-06-25 14:41:16 +03:00
Aliaksandr Valialkin
362e187011 lib/storage: use unversioned keys for tag cache in extDB
Data in ExtDB cannot be changed, so it is OK to use unversioned keys for tag cache.
This should improve performance for index lookups over big amount of time series.
2019-06-25 13:15:42 +03:00
Aliaksandr Valialkin
51e2f3b48f lib/storage: skip searching in extDB if it doesn't contain items for the given time range
This should improve inverted index search performance for big amount
of unique time series when the search is performed only on recent data.
2019-06-25 12:57:56 +03:00
Aliaksandr Valialkin
dbc1e87bac deployment: update docker images 2019-06-24 23:11:03 +03:00
Aliaksandr Valialkin
d0bf4393a9 app/vmselect/promql: increase default value for -search.maxPointsPerTimeSeries from 10k to 30k
This may be required for subqueries with small steps. See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/77
2019-06-24 22:53:25 +03:00
Aliaksandr Valialkin
334cf253c7 app/vmselect/promql: adjust value returned by linearRegression to the end of time range like Prometheus does
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/71
2019-06-24 22:46:03 +03:00
Aliaksandr Valialkin
14cd628948 app/vmselect/promql: add sum2 and sum2_over_time, geomean and geomean_over_time funcs.
These functions may be useful for statistic calculations.
2019-06-24 16:45:00 +03:00
Aliaksandr Valialkin
fb9358635d lib/storage: mention source parts on merge error
This should help determine the broken source part.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/76
2019-06-24 14:09:46 +03:00
Aliaksandr Valialkin
0eac538fc8 app/vmselect/promql: adjust the provided window only for range functions with dt in denominator
This should fix range function calculations such as `changes(m[d])` where `d` is smaller
than the scrape interval.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/72
2019-06-23 19:27:25 +03:00
Aliaksandr Valialkin
ec57e59154 app/vmselect/promql: use deriv_fast instead of deriv in ttf, since deriv calculations have been changed recently 2019-06-23 15:54:12 +03:00
Aliaksandr Valialkin
516062b162 app/vmselect/promql: adjust ttf calculation, so deriv(freev) for freev=m[d] could be properly calculated 2019-06-23 14:31:36 +03:00
Aliaksandr Valialkin
5ea5ec4f44 vendor: update github.com/valyala/gozstd to v1.5.1 2019-06-22 00:14:11 +03:00
Aliaksandr Valialkin
ef6ca22c1d deployment: update docker images 2019-06-21 23:35:48 +03:00
Aliaksandr Valialkin
a4e040f5ef app/vmselect/promql: typo fixes in comments 2019-06-21 23:22:54 +03:00
Aliaksandr Valialkin
c05d443791 app/vmselect/promql: add deriv_fast function for calculating fast derivative
`deriv_fast` calculates derivative based on the first and the last point on the interval
instead of calculating linear regression based on all the data points on the interval.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/73
2019-06-21 23:05:48 +03:00
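For illustration, the two derivative functions can be compared side by side via the vmselect query API described later on this page. This is only a sketch: the metric name `node_cpu_seconds_total`, the tenant `0` and the `<vmselect>` address are placeholders.

```
# deriv: linear regression over all the points in the window (Prometheus-compatible)
curl 'http://<vmselect>:8481/select/0/prometheus/api/v1/query' \
  --data-urlencode 'query=deriv(node_cpu_seconds_total[5m])'

# deriv_fast: derivative from the first and the last point of the window only
curl 'http://<vmselect>:8481/select/0/prometheus/api/v1/query' \
  --data-urlencode 'query=deriv_fast(node_cpu_seconds_total[5m])'
```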
Aliaksandr Valialkin
98eafdbd58 app/vmselect/promql: use linear regression in deriv func like Prometheus does
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/73
2019-06-21 22:54:34 +03:00
Aliaksandr Valialkin
f334908c22 app/vmselect/promql: adjust data model to the model used in Prometheus
Do not take into account data points on the range `[timestamp .. timestamp+step)`
when calculating value on the given `timestamp`.
Use only data points from the past when performing these calculations like Prometheus does.

This should reduce discrepancies between results returned by VictoriaMetrics
and results returned by Prometheus.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/72
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/71
2019-06-21 21:55:25 +03:00
Aliaksandr Valialkin
0fc4cb67dc deployment: update docker images 2019-06-21 13:39:45 +03:00
Aliaksandr Valialkin
837e349b7d app/vmselect/promql: do not strip __name__ from time series after binary comparison operation
Example:

  foo > 10

Would leave `foo` name for all the matching time series on the left.
2019-06-21 13:08:02 +03:00
Aliaksandr Valialkin
9164c223ec all: initial stubs for Windows support; see https://github.com/VictoriaMetrics/VictoriaMetrics/issues/70 2019-06-20 20:07:41 +03:00
Aliaksandr Valialkin
786beb8fc8 Makefile: enable golangci-lint in make check_all 2019-06-20 15:00:58 +03:00
Aliaksandr Valialkin
9cac11db64 lib/storage: typo fixes found by golangci-lint; updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/69 2019-06-20 14:38:45 +03:00
Aliaksandr Valialkin
7778030f9f lib/netutil: remove unused TCPListener.name; updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/69 2019-06-20 14:36:19 +03:00
Aliaksandr Valialkin
e84b7641ef app/vmselect/promql: remove unused func keepLastValue; updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/69 2019-06-20 14:35:19 +03:00
Aliaksandr Valialkin
db042bf6d6 app/vmselect/promql: typo fix; updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/69 2019-06-20 14:33:52 +03:00
Aliaksandr Valialkin
dec2bdf89f Makefile: add make golangci-lint rule for running golangci-lint run; updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/69 2019-06-20 14:32:34 +03:00
Aliaksandr Valialkin
3838d224d5 app/vminsert/opentsdb: remove unused const maxReadPacketSize; update https://github.com/VictoriaMetrics/VictoriaMetrics/issues/69 2019-06-20 14:30:02 +03:00
Aliaksandr Valialkin
a3a53647ba app/vmselect/prometheus: return better error messages on missing args to /api/v1/* 2019-06-20 14:07:44 +03:00
Aliaksandr Valialkin
a0c22a6830 app/vmstorage: add vm_cache_entries{type="storage/hour_metric_ids"} metric for tracking active time series count 2019-06-19 18:37:38 +03:00
Aliaksandr Valialkin
08e255a206 README.md: add link to source codes for cluster branch 2019-06-19 17:56:56 +03:00
Aliaksandr Valialkin
24ae3ef532 lib/prompb: remove superfluous bytes copying in ReadSnappy 2019-06-18 21:02:02 +03:00
Aliaksandr Valialkin
d4ed6189d4 app/vminsert/graphite: allow skipping timestamps in Graphite plaintext protocol
In this case VictoriaMetrics uses the ingestion time as a timestamp.
2019-06-18 19:05:46 +03:00
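For example, assuming the Graphite receiver is enabled via `-graphiteListenAddr=:2003` (see the README section below), a plaintext line may be sent with or without the trailing timestamp. The metric name, the value and the netcat flags are illustrative only:

```
# Line with an explicit timestamp:
echo "some.app.cpu.usage 42 $(date +%s)" | nc -q1 <victoriametrics-addr> 2003

# Line without a timestamp - the ingestion time is used instead:
echo "some.app.cpu.usage 42" | nc -q1 <victoriametrics-addr> 2003
```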
Aliaksandr Valialkin
7b93da5b57 vendor: update golang.org/x/sys 2019-06-18 16:20:09 +03:00
Aliaksandr Valialkin
2ebcd0c98b deployment: update docker images 2019-06-18 13:36:42 +03:00
Aliaksandr Valialkin
e40224d5de lib/flagutil: add NewArray helper func 2019-06-18 10:44:09 +03:00
Aliaksandr Valialkin
02417071cd README.md: use link to Wikipedia about broken gossip protocol instead of a link to document about Gossip protocol removal from Thanos
Thanos removed non-working gossip protocol a few months ago - https://github.com/improbable-eng/thanos/issues/734 ,
so the link to the design document https://github.com/improbable-eng/thanos/blob/master/docs/proposals/approved/201809_gossip-removal.md
became unavailable. So use a link to Wikipedia article instead.

Closes https://github.com/VictoriaMetrics/VictoriaMetrics/pull/68
2019-06-17 19:10:29 +03:00
Aliaksandr Valialkin
3b16d49514 app/vminsert/influx: add -influxSkipSingleField flag for using {measurement} instead of {measurement}{separator}{field_name} for Influx lines with a single field
Fixes https://github.com/VictoriaMetrics/VictoriaMetrics/issues/66
2019-06-17 19:05:46 +03:00
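A sketch of what this flag changes for a single-field line pushed via the Influx write endpoint described later on this page; the tenant `0`, the measurement and the tag are placeholders:

```
curl -d 'temperature,city=SF value=23.5' 'http://<vminsert>:8480/insert/0/influx/write'

# Without the flag the resulting metric name is {measurement}{separator}{field_name},
# e.g. temperature_value{city="SF"} with the "_" separator.
# With -influxSkipSingleField it becomes just temperature{city="SF"}.
```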
Aliaksandr Valialkin
5f0b3589b2 app/vminsert/influx: add -influxMeasurementFieldSeparator flag for the ability to change separator for {measurement}{separator}{field_name} metric name 2019-06-14 09:57:13 +03:00
Aliaksandr Valialkin
14edd122a6 deployment/docker: switch builder from go1.12.5 to go1.12.6 2019-06-14 09:31:56 +03:00
Aliaksandr Valialkin
f9e1d32168 lib/storage: persist metric ids for the current and the previous hour on graceful shutdown
This should improve performance after restart when the db contains a lot of time series
with high time series churn (i.e. metrics from Kubernetes with many pods and frequent deployments)
2019-06-14 07:55:09 +03:00
Aliaksandr Valialkin
ba3cccd471 deployment: update docker images 2019-06-12 23:31:06 +03:00
Aliaksandr Valialkin
947bc16f8c app/vmselect/promql: use dynamic limit on memory for concurrent queries 2019-06-12 23:18:23 +03:00
Aliaksandr Valialkin
fe1b33ef1a README.md: mention that accountID is known as tenant 2019-06-12 21:32:10 +03:00
Aliaksandr Valialkin
8567e3463d app/vmselect/promql: merge non-overlapping duplicate time series in group_left and group_right joins 2019-06-12 20:33:01 +03:00
Aliaksandr Valialkin
345ecc37b6 deployment: update docker images 2019-06-12 18:36:17 +03:00
Aliaksandr Valialkin
88005237f4 app/vmselect/promql: swap binary operation with modifier in the error message for improved readability 2019-06-12 17:14:33 +03:00
Aliaksandr Valialkin
a71381ad2a app/vmselect/promql: list a sample of duplicate time series in the error message for group_left or group_right
This should improve troubleshooting for complex queries involving `group_left` and `group_right` modifiers.
2019-06-12 16:57:34 +03:00
Aliaksandr Valialkin
b0b93e3d50 lib/fs: sync parent dir in MustRemoveAll only if it exists
The parent directory may not exist when the deleted directory
didn't exist before the MustRemoveAll call
2019-06-12 02:16:15 +03:00
Aliaksandr Valialkin
18d6f293f7 lib/fs: consolidate *RemoveAll* funcs into a single MustRemoveAll func
The func syncs parent dir in order to persist directory removal
in the event of power loss
2019-06-12 01:55:18 +03:00
Aliaksandr Valialkin
28d9904efc lib/fs: panic with fatal error when directories cannot be removed
Unremoved directories may lead to an inconsistent data directory,
so VictoriaMetrics will fail to start next time.

So panic on the first error when trying to remove a directory in order
to simplify the recovery process.
2019-06-12 01:20:10 +03:00
Aliaksandr Valialkin
d897bc3f08 lib/fs: attempt #2 to work around NFS issue with directory removal
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/61
2019-06-12 01:07:29 +03:00
Aliaksandr Valialkin
f165500225 vendor: update github.com/VictoriaMetrics/fastcache to v1.5.1 2019-06-11 23:57:15 +03:00
Aliaksandr Valialkin
d1ca2e5a2d deployment: update docker images 2019-06-11 23:22:51 +03:00
Aliaksandr Valialkin
51e2e255a6 lib/fs: consistency renaming SyncPath -> MustSyncPath, since it doesn't return an error 2019-06-11 23:13:45 +03:00
Aliaksandr Valialkin
3fa4c28f6b lib/fs: make sure the created directory remains visible in the fs in the event of power loss
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/63
2019-06-11 23:08:17 +03:00
Aliaksandr Valialkin
0b7f751f60 lib/fs: use filepath.Dir instead of filepath.Split, since the filename is unused 2019-06-11 22:54:23 +03:00
Aliaksandr Valialkin
cb9e746484 deployment: update docker images 2019-06-11 22:02:08 +03:00
Aliaksandr Valialkin
b491045a4b lib/{storage,mergeset}: sync filenames inside part when finalizing the part
Fixes https://github.com/VictoriaMetrics/VictoriaMetrics/issues/63
2019-06-11 21:51:19 +03:00
Aliaksandr Valialkin
3437c30180 all: try hard removing directory with contents
Fixes https://github.com/VictoriaMetrics/VictoriaMetrics/issues/61
2019-06-11 01:58:08 +03:00
Aliaksandr Valialkin
f2a8599908 deployment: update docker images 2019-06-11 01:09:52 +03:00
Aliaksandr Valialkin
eea7da8e0c app/vmselect/promql: prevent count_values from producing a time series explosion, which could result in OOM 2019-06-11 01:03:18 +03:00
Aliaksandr Valialkin
e87a602209 app/vmselect/promql: skip superfluous timestamp copying in count_values 2019-06-11 00:44:09 +03:00
Aliaksandr Valialkin
ec84febc1c app/vmselect/promql: remove superfluous timeseries copy in histogram_quantile func 2019-06-11 00:39:35 +03:00
Aliaksandr Valialkin
1fab34fb5c app/vmselect/promql: remove superfluous timeseries copy in union func 2019-06-11 00:35:09 +03:00
Aliaksandr Valialkin
a6f368499d app/vmselect/promql: skip NaN values in count_values func 2019-06-10 22:42:41 +03:00
Aliaksandr Valialkin
2d7165033a deployment: update docker images 2019-06-10 20:38:18 +03:00
Aliaksandr Valialkin
945894e049 app/vmselect: properly handle empty label (aka __name__) in LabelEntries handler 2019-06-10 19:55:02 +03:00
Aliaksandr Valialkin
75a0acf72d app/vmselect: add /api/v1/labels/count handler for quick detection of labels with the maximum number of distinct values 2019-06-10 19:54:55 +03:00
Aliaksandr Valialkin
547bcdce63 app/vmstorage: enable compression of responses to vmselect by default
This should save vmstorage => vmselect network bandwidth in common case
when recently added data is queried.
2019-06-10 14:54:59 +03:00
Aliaksandr Valialkin
0ccedbdfd2 lib/storage: mention the accountID and projectID in error message when filtering out other (accountID, projectID) entries 2019-06-10 14:43:53 +03:00
Aliaksandr Valialkin
d54f5fec0b lib/storage: skip adaptive searching for a tag filter matching the minimum number of metrics if the identical previous search didn't find such a filter
This should improve speed for searching metrics among a high number of time series
with high churn rate like in big Kubernetes clusters with frequent deployments.
2019-06-10 14:07:47 +03:00
Aliaksandr Valialkin
27e50e86f4 lib/storage: factor out getTagFilterWithMinMetricIDsCountAdaptive from updateMetricIDsForTagFilters 2019-06-10 13:26:00 +03:00
Aliaksandr Valialkin
b69d3dbd0c lib/storage: filter out metricIDs from another (AccountID, ProjectID) in getMetricIDsForRecentHours 2019-06-10 13:05:16 +03:00
Aliaksandr Valialkin
3059ae7be0 lib/storage: give clearer names to more functions 2019-06-10 12:59:33 +03:00
Aliaksandr Valialkin
d3a024d2d6 lib/storage: give clearer names to functions 2019-06-10 12:50:22 +03:00
Aliaksandr Valialkin
00e0760608 lib/storage: test GetSeriesCount 2019-06-10 12:40:33 +03:00
Aliaksandr Valialkin
e4cba5a7ed lib/storage: make getSeriesCount func indexSearch method 2019-06-10 12:29:24 +03:00
Aliaksandr Valialkin
4c3913290a app/vmstorage: add missing _total suffixes to newly added metrics 2019-06-09 22:11:41 +03:00
Aliaksandr Valialkin
d882afa905 lib/storage: optimize time series lookup for recent hours when the db contains many millions of time series with high churn rate (aka frequent deployments in Kubernetes) 2019-06-09 19:14:04 +03:00
Aliaksandr Valialkin
5fcdb4a59a app/vminsert: improve handling of unhealthy vmstorage nodes
* Spread load evenly among remaining healthy nodes instead of hammering
  the next node after the unhealthy node.
* Make sure that the packet is flushed to storage node before returning success.
  Previously packets could stay in local buffers and thus be lost on connection errors.
* Keep rows in the limited memory when all the storage nodes are unhealthy.
2019-06-09 00:42:36 +03:00
Aliaksandr Valialkin
0f64673327 app/vminsert/concurrencylimiter: typo fix in the error message 2019-06-08 22:43:56 +03:00
Aliaksandr Valialkin
89a113cb5d app/vminsert: really fix #60
ReadLinesBlock may accept a dstBuf with non-zero length. In this case the last line without a trailing newline isn't read.
Fix this by comparing len(dstBuf) to 0 instead of its original length.
2019-06-07 23:40:10 +03:00
Aliaksandr Valialkin
e1c45b314a app/vminsert: properly read trailing line without newline in the end
This fixes https://github.com/VictoriaMetrics/VictoriaMetrics/issues/60
2019-06-07 23:18:34 +03:00
Aliaksandr Valialkin
8cf0a0e59c app/vminsert: split vm_rows_inserted_total into per-(accountID, projectID) metrics
Fixes https://github.com/VictoriaMetrics/VictoriaMetrics/issues/59
2019-06-07 22:11:20 +03:00
Aliaksandr Valialkin
8b2a6c6182 deployment: update docker images 2019-06-07 11:58:33 +03:00
Aliaksandr Valialkin
30c7652bad deployment/docker: move cluster docker images from valyala/vm* to victoriametrics/vm* docker hub path 2019-06-07 11:55:37 +03:00
Aliaksandr Valialkin
41d087662c deployment: update docker image 2019-06-07 11:40:54 +03:00
Aliaksandr Valialkin
913f888d0c app/vmselect/promql: properly handle {__name__ op "string"} queries
This has been broken in 7294ef333ad26f4f6578b783e97649e58b1f8945 .
2019-06-07 02:02:09 +03:00
Seua Polyakov
5e51ce386e Add SIGINT as stopsignal to docker file (#54)
Add sigint as stopsignal to docker file. You can find more here: https://docs.docker.com/engine/reference/builder/#usage
With this change, the main process inside the container will receive SIGINT, and after a grace period, SIGKILL.

(cherry picked from commit f4e63cd070)
2019-06-06 22:38:48 +03:00
Aliaksandr Valialkin
11979e4d85 app/vmselect/prometheus: report about incorrect time or duration instead of silently using the default value
This should prevent incorrect usage of the querying API.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/52
2019-06-06 22:17:15 +03:00
Aliaksandr Valialkin
5f2aa4539a app/vminsert: add multi-tenancy support for OpenTSDB and Graphite ingestion via custom tags
* VictoriaMetrics_AccountID tag may be used for setting AccountID
* VictoriaMetrics_ProjectID tag may be used for setting ProjectID
2019-06-06 18:07:30 +03:00
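A rough sketch of how such tenant tags might look in ingested lines; the metric names, tenant IDs, ports and addresses are placeholders:

```
# Graphite plaintext line with tags selecting accountID=42 and projectID=7:
echo "some.metric;VictoriaMetrics_AccountID=42;VictoriaMetrics_ProjectID=7 10 $(date +%s)" | nc -q1 <vminsert-addr> 2003

# OpenTSDB telnet put message with the same tenant tags:
echo "put some.metric $(date +%s) 10 VictoriaMetrics_AccountID=42 VictoriaMetrics_ProjectID=7" | nc -q1 <vminsert-addr> 4242
```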
Aliaksandr Valialkin
c98582695f deployment: update docker images 2019-06-06 17:39:31 +03:00
Aliaksandr Valialkin
8f4790625d app/vmselect/promql: return the correct time series from quantile
Previously arbitrary time series could be returned from `quantile`
depending on sort order for the last data point in the selected range.

Fix this by returning the calculated time series.

Fixes https://github.com/VictoriaMetrics/VictoriaMetrics/issues/55
2019-06-06 17:33:53 +03:00
Aliaksandr Valialkin
2ff0d595b0 app/vmselect/promql: add -search.disableCache flag for disabling response caching
This may be useful for data back-filling, when the response caching
could interfere badly with newly added data points with timestamps
in the past.
2019-06-04 17:30:41 +03:00
Aliaksandr Valialkin
595a421295 deployment: update docker images 2019-06-03 23:00:10 +03:00
Aliaksandr Valialkin
ba58af9d8c app/vminsert/influx: take into account all the tags for consistent hash calculations 2019-06-03 22:54:21 +03:00
Aliaksandr Valialkin
db21d46417 app/vminsert: emulate influx/query request, which is required for TSBS benchmark 2019-06-03 18:39:46 +03:00
Aliaksandr Valialkin
8ad0fb5689 deployment: update docker images 2019-06-03 18:21:18 +03:00
Aliaksandr Valialkin
31d6566aff app/vminsert: accept data on /insert/<accountID>/prometheus/api/v1/write 2019-06-03 18:18:09 +03:00
Aliaksandr Valialkin
c3d73e347c deployment/k8s/helm: update NOTES.txt 2019-06-03 17:53:36 +03:00
Aliaksandr Valialkin
cf75d1f0fc README.md: mention that unused snapshots must be deleted 2019-06-03 17:26:35 +03:00
Aliaksandr Valialkin
a06b7f7f84 app/vmselect/netstorage: remove spammy error message when certain vmstorage nodes are unavailable during query execution
The amount of partial responses may be tracked by `vm_partial_search_results_total` metric.
2019-06-03 17:09:50 +03:00
Aliaksandr Valialkin
1d87abc8eb lib/procutil: typo fix in comment to WaitForSigterm 2019-06-03 16:54:37 +03:00
Aliaksandr Valialkin
a2986cde70 lib/storage: tune updating a map with today's metric ids
- Increase the update interval from 1s to 10s. This should reduce CPU usage
  for large amounts of metric ids with constant churn.
- Reduce pendingTodayMetricIDsLock lock duration during the update.
2019-06-02 22:00:13 +03:00
Aliaksandr Valialkin
e27fd5148a lib/storage: speed up checking metricID existence in the list for the current date 2019-06-02 18:34:20 +03:00
Aliaksandr Valialkin
d7bafde77e vendor: update deps with make vendor-update 2019-06-01 23:40:56 +03:00
Aliaksandr Valialkin
53242105fb app/vmselect/promql: allow escaping identifiers with \ and \xXX
Fixes https://github.com/VictoriaMetrics/VictoriaMetrics/issues/42
2019-05-31 17:35:54 +03:00
Aliaksandr Valialkin
25269682c2 deployment: update docker images 2019-05-29 17:44:14 +03:00
Aliaksandr Valialkin
950310d1c3 Add sections about replication and backups 2019-05-29 13:14:01 +03:00
Aliaksandr Valialkin
ee776ca8fc app/vminsert: add -maxConcurrentInserts command-line flag for limiting the number of concurrent inserts 2019-05-29 12:40:22 +03:00
Aliaksandr Valialkin
a1289d7343 Makefile: run go vet with -mod=vendor in order to disable downloading vendored deps 2019-05-29 01:38:24 +03:00
Aliaksandr Valialkin
a4ec139a4a app/vminsert: reduce memory usage for Influx, Graphite and OpenTSDB protocols
Do not buffer per-connection data and just store it as it arrives
2019-05-28 18:47:52 +03:00
Aliaksandr Valialkin
a6d02ff275 lib/timerpool: use timer pool in concurrency limiters
This should reduce the number of memory allocations in highly loaded systems
2019-05-28 17:30:10 +03:00
Aliaksandr Valialkin
6e90aaeb8c Makefile: add -mod=vendor to go test, so tests use external deps from vendor folder 2019-05-27 00:35:59 +03:00
Aliaksandr Valialkin
3b52adaf3f Makefile: pass GO111MODULE=on to all the go invocations 2019-05-26 23:23:21 +03:00
Aliaksandr Valialkin
c944de68cd vendor: update dependencies with make vendor-update 2019-05-26 23:18:42 +03:00
Aliaksandr Valialkin
b7a91d6ba7 app/vmselect: update comment according to the updated code 2019-05-26 22:39:09 +03:00
Aliaksandr Valialkin
15d1e15ae6 app/vminsert/influx: try converting string values to numeric values, since Influx agents may send numeric values as strings
Fixes https://github.com/VictoriaMetrics/VictoriaMetrics/issues/34
2019-05-26 22:12:55 +03:00
Aliaksandr Valialkin
a2c71f18a3 app/vmselect/promql: misspelling fix 2019-05-25 21:53:48 +03:00
Aliaksandr Valialkin
bdf696ef18 all: fix misspellings 2019-05-25 21:51:24 +03:00
Aliaksandr Valialkin
121a920a18 Makefile: add -s flag to go fmt in make fmt command 2019-05-25 21:44:36 +03:00
Aliaksandr Valialkin
a10d27eccd README.md: mention that new vmstorage node must have the same -retentionPeriod as the previous nodes 2019-05-25 17:18:44 +03:00
Aliaksandr Valialkin
c254adba7c README.md: mention that VictoriaMetrics is scalable 2019-05-25 17:09:15 +03:00
Aliaksandr Valialkin
affeb677cc README.md: mention that the majority of users should use single-node version instead of cluster version 2019-05-25 14:09:17 +03:00
Aliaksandr Valialkin
2ff996e276 app/vmselect: log slow queries if their execution time exceeds -search.logSlowQueryDuration 2019-05-24 16:14:46 +03:00
Aliaksandr Valialkin
628708ad76 app/vmselect: consume resultsCh data in exportHandler if writeResponseFunc failed to consume it 2019-05-24 14:54:54 +03:00
Aliaksandr Valialkin
209ad975ae deployment: update docker images 2019-05-24 13:16:20 +03:00
Aliaksandr Valialkin
9b64dfee4b lib/encoding: add vm_zstd_block_{compress|decompress}_calls_total for determining the number of CompressZSTD / DecompressZSTD calls 2019-05-24 13:01:15 +03:00
Aliaksandr Valialkin
364f4ec3bb all: remove -p XXXX:XXXX from docker run options, since it is unnecessary if --net=host is set 2019-05-24 12:53:12 +03:00
Aliaksandr Valialkin
f37903adb3 app/vminsert: add -rpc.disableCompression command-line flag for reducing CPU usage at the cost of higher network bandwidth usage 2019-05-24 12:51:07 +03:00
Aliaksandr Valialkin
b23352dc9e lib/encoding: add vm_zstd_block_{original|compressed}_bytes_total metrics for rough estimation of block compression ratio 2019-05-24 12:34:51 +03:00
Aliaksandr Valialkin
f67f40d63a lib/encoding: substitute CompressZSTD with CompressZSTDLevel 2019-05-24 12:32:49 +03:00
Aliaksandr Valialkin
a26e774eca lib/httpserver: add -http.disableResponseCompression flag, which may help saving CPU resources at the cost of higher network bandwidth usage 2019-05-24 12:20:36 +03:00
Aliaksandr Valialkin
8e3eb5b39d app/vmselect/promql: add alias(q, name) function that sets the given name to all the time series in q 2019-05-24 02:42:10 +03:00
Aliaksandr Valialkin
820cdae88d lib/decimal: add a comment explaining weird code in maxUpExponent. Fixes #29 2019-05-23 17:18:50 +03:00
Aliaksandr Valialkin
bb048937bc app/vmselect/promql: add label_transform(q, label, regexp, replacement) function for replacing all the occurrences of regexp with replacement in the given label for q 2019-05-23 16:26:07 +03:00
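The two PromQL extensions added in the commits above can be tried via the vmselect query API described later on this page. A sketch only: `up`, `process_resident_memory_bytes`, the tenant `0` and the `<vmselect>` address are placeholders.

```
# alias(): set an explicit name for the resulting series
curl 'http://<vmselect>:8481/select/0/prometheus/api/v1/query' \
  --data-urlencode 'query=alias(process_resident_memory_bytes / 1024 / 1024, "rss_mib")'

# label_transform(): strip the port from the "instance" label
curl 'http://<vmselect>:8481/select/0/prometheus/api/v1/query' \
  --data-urlencode 'query=label_transform(up, "instance", ":[0-9]+$", "")'
```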
Aliaksandr Valialkin
54346de548 README.md: typo fix 2019-05-23 02:25:54 +03:00
Aliaksandr Valialkin
b98789ae9f README.md: mention that VictoriaMetrics is high-perf cost-effective TSDB 2019-05-23 00:41:08 +03:00
Aliaksandr Valialkin
24578b4bb1 all: open-sourcing cluster version 2019-05-23 00:25:38 +03:00
1683 changed files with 497471 additions and 44872 deletions

.github/workflows/main.yml (new file, 41 lines)

@@ -0,0 +1,41 @@
name: main
on:
- push
- pull_request
jobs:
  build:
    name: Build
    runs-on: ubuntu-latest
    steps:
    - name: Setup Go
      uses: actions/setup-go@v1
      with:
        go-version: 1.13
      id: go
    - name: Code checkout
      uses: actions/checkout@v1
    - name: Dependencies
      env:
        GO111MODULE: off
      run: |
        go get -v golang.org/x/lint/golint
        go get -u github.com/kisielk/errcheck
    - name: Build
      env:
        GO111MODULE: on
      run: |
        export PATH=$PATH:$(go env GOPATH)/bin # temporary fix. See https://github.com/actions/setup-go/issues/14
        make check-all
        git diff --exit-code
        make test-full
        make test-pure
        make test-full-386
        make vminsert vmselect vmstorage
        make vminsert-pure vmselect-pure vmstorage-pure
        make vmutils
        GOOS=freebsd go build -mod=vendor ./app/vminsert
        GOOS=freebsd go build -mod=vendor ./app/vmselect
        GOOS=freebsd go build -mod=vendor ./app/vmstorage
        GOOS=darwin go build -mod=vendor ./app/vminsert
        GOOS=darwin go build -mod=vendor ./app/vmselect
        GOOS=darwin go build -mod=vendor ./app/vmstorage

.gitignore (8 lines changed)

@@ -7,5 +7,13 @@
*.swp
/gocache-for-docker
/victoria-metrics-data
/vmagent-remotewrite-data
/vmstorage-data
/vmselect-cache
.DS_Store
### terraform
terraform.tfstate
terraform.tfstate.*
.terraform/

LICENSE

@@ -175,7 +175,7 @@
END OF TERMS AND CONDITIONS
Copyright 2019 VictoriaMetrics, Inc.
Copyright 2019-2020 VictoriaMetrics, Inc.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.

Makefile (113 lines changed)

@@ -1,7 +1,7 @@
PKG_PREFIX := github.com/VictoriaMetrics/VictoriaMetrics
BUILDINFO_TAG ?= $(shell echo $$(git describe --long --all | tr '/' '-')$$( \
git diff-index --quiet HEAD -- || echo '-dirty-'$$(git diff-index -u HEAD | sha1sum | grep -oP '^.{8}')))
git diff-index --quiet HEAD -- || echo '-dirty-'$$(git diff-index -u HEAD | openssl sha1 | cut -c 10-17)))
PKG_TAG ?= $(shell git tag -l --points-at HEAD)
ifeq ($(PKG_TAG),)
@@ -11,7 +11,20 @@ endif
GO_BUILDINFO = -X '$(PKG_PREFIX)/lib/buildinfo.Version=$(APP_NAME)-$(shell date -u +'%Y%m%d-%H%M%S')-$(BUILDINFO_TAG)'
all: \
victoria-metrics-prod
vminsert \
vmselect \
vmstorage \
vmagent \
vmbackup \
vmrestore
all-pure: \
vminsert-pure \
vmselect-pure \
vmstorage-pure \
vmagent-pure \
vmbackup-pure \
vmrestore-pure
include app/*/Makefile
include deployment/*/Makefile
@@ -19,46 +32,118 @@ include deployment/*/Makefile
clean:
rm -rf bin/*
release: victoria-metrics-prod
cd bin && tar czf victoria-metrics-$(PKG_TAG).tar.gz victoria-metrics-prod
publish: \
publish-vminsert \
publish-vmselect \
publish-vmstorage \
publish-vmagent \
publish-vmbackup \
publish-vmrestore
package: \
package-vminsert \
package-vmselect \
package-vmstorage \
package-vmagent \
package-vmbackup \
package-vmrestore
vmutils: \
vmagent \
vmbackup \
vmrestore
release: \
release-vmcluster \
release-vmutils
release-vmcluster: \
vminsert-prod \
vmselect-prod \
vmstorage-prod
cd bin && tar czf victoria-metrics-$(PKG_TAG).tar.gz vminsert-prod vmselect-prod vmstorage-prod && \
sha256sum victoria-metrics-$(PKG_TAG).tar.gz > victoria-metrics-$(PKG_TAG)_checksums.txt
release-vmutils: \
vmagent-prod \
vmbackup-prod \
vmrestore-prod
cd bin && tar czf vmutils-$(PKG_TAG).tar.gz vmagent-prod vmbackup-prod vmrestore-prod && \
sha256sum vmutils-$(PKG_TAG).tar.gz > vmutils-$(PKG_TAG)_checksums.txt
pprof-cpu:
go tool pprof -trim_path=github.com/VictoriaMetrics/VictoriaMetrics@ $(PPROF_FILE)
fmt:
go fmt $(PKG_PREFIX)/lib/...
go fmt $(PKG_PREFIX)/app/...
GO111MODULE=on gofmt -l -w -s ./lib
GO111MODULE=on gofmt -l -w -s ./app
vet:
go vet $(PKG_PREFIX)/lib/...
go vet $(PKG_PREFIX)/app/...
GO111MODULE=on go vet -mod=vendor ./lib/...
GO111MODULE=on go vet -mod=vendor ./app/...
lint: install-golint
golint lib/...
golint app/...
install-golint:
which golint || GO111MODULE=off go get -u github.com/golang/lint/golint
which golint || GO111MODULE=off go get -u golang.org/x/lint/golint
errcheck: install-errcheck
errcheck -exclude=errcheck_excludes.txt ./lib/...
errcheck -exclude=errcheck_excludes.txt ./app/vminsert/...
errcheck -exclude=errcheck_excludes.txt ./app/vmselect/...
errcheck -exclude=errcheck_excludes.txt ./app/vmstorage/...
errcheck -exclude=errcheck_excludes.txt ./app/vmagent/...
errcheck -exclude=errcheck_excludes.txt ./app/vmbackup/...
errcheck -exclude=errcheck_excludes.txt ./app/vmrestore/...
install-errcheck:
which errcheck || GO111MODULE=off go get -u github.com/kisielk/errcheck
check-all: fmt vet lint errcheck golangci-lint
test:
go test $(PKG_PREFIX)/lib/...
GO111MODULE=on go test -mod=vendor ./lib/... ./app/...
test-pure:
GO111MODULE=on CGO_ENABLED=0 go test -mod=vendor ./lib/... ./app/...
test-full:
GO111MODULE=on go test -mod=vendor -coverprofile=coverage.txt -covermode=atomic ./lib/... ./app/...
test-full-386:
GO111MODULE=on GOARCH=386 go test -mod=vendor -coverprofile=coverage.txt -covermode=atomic ./lib/... ./app/...
benchmark:
go test -bench=. $(PKG_PREFIX)/lib/...
GO111MODULE=on go test -mod=vendor -bench=. ./lib/...
GO111MODULE=on go test -mod=vendor -bench=. ./app/...
benchmark-pure:
GO111MODULE=on CGO_ENABLED=0 go test -mod=vendor -bench=. ./lib/...
GO111MODULE=on CGO_ENABLED=0 go test -mod=vendor -bench=. ./app/...
vendor-update:
go get -u
go mod tidy
go mod vendor
GO111MODULE=on go get -u ./lib/...
GO111MODULE=on go get -u ./app/...
GO111MODULE=on go mod tidy
GO111MODULE=on go mod vendor
app-local:
CGO_ENABLED=1 GO111MODULE=on go build $(RACE) -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/$(APP_NAME)$(RACE) $(PKG_PREFIX)/app/$(APP_NAME)
app-local-pure:
CGO_ENABLED=0 GO111MODULE=on go build $(RACE) -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/$(APP_NAME)-pure$(RACE) $(PKG_PREFIX)/app/$(APP_NAME)
quicktemplate-gen: install-qtc
qtc
install-qtc:
which qtc || GO111MODULE=off go get -u github.com/valyala/quicktemplate/qtc
golangci-lint: install-golangci-lint
golangci-lint run --exclude '(SA4003|SA1019):' -D errcheck -D structcheck
install-golangci-lint:
which golangci-lint || GO111MODULE=off go get -u github.com/golangci/golangci-lint/cmd/golangci-lint

README.md (521 lines changed)

@@ -1,386 +1,286 @@
<img text-align="center" alt="Victoria Metrics" src="logo.png">
<img alt="Victoria Metrics" src="logo.png">
## Single-node VictoriaMetrics
# Cluster version
[![Latest Release](https://img.shields.io/github/release/VictoriaMetrics/VictoriaMetrics.svg?style=flat-square)](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest)
VictoriaMetrics is a fast, cost-effective and scalable time series database. It can be used as a long-term remote storage for Prometheus.
VictoriaMetrics is a long-term remote storage for Prometheus.
It is available in [binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases),
[docker images](https://hub.docker.com/r/valyala/victoria-metrics/) and
in [source code](https://github.com/VictoriaMetrics/VictoriaMetrics).
It is recommended to use the [single-node version](https://github.com/VictoriaMetrics/VictoriaMetrics) instead of the cluster version
for ingestion rates lower than a million data points per second.
Single-node version [scales perfectly](https://medium.com/@valyala/measuring-vertical-scalability-for-time-series-databases-in-google-cloud-92550d78d8ae)
with the number of CPU cores, RAM and available storage space.
The single-node version is easier to configure and operate compared to the cluster version, so think twice before choosing the cluster version.
Cluster version is available [here](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/cluster).
Join [our Slack](http://slack.victoriametrics.com/) or [contact us](mailto:info@victoriametrics.com) with consulting and support questions.
## Prominent features
* Supports [Prometheus querying API](https://prometheus.io/docs/prometheus/latest/querying/api/), so it can be used as Prometheus drop-in replacement in Grafana.
Additionally, VictoriaMetrics extends PromQL with opt-in [useful features](https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/ExtendedPromQL).
* High performance and good scalability for both [inserts](https://medium.com/@valyala/high-cardinality-tsdb-benchmarks-victoriametrics-vs-timescaledb-vs-influxdb-13e6ee64dd6b)
and [selects](https://medium.com/@valyala/when-size-matters-benchmarking-victoriametrics-vs-timescale-and-influxdb-6035811952d4).
[Outperforms InfluxDB and TimescaleDB by up to 20x](https://medium.com/@valyala/measuring-vertical-scalability-for-time-series-databases-in-google-cloud-92550d78d8ae).
* [Uses 10x less RAM than InfluxDB](https://medium.com/@valyala/insert-benchmarks-with-inch-influxdb-vs-victoriametrics-e31a41ae2893) when working with millions of unique time series (aka high cardinality).
* High data compression, so [up to 70x more data points](https://medium.com/@valyala/when-size-matters-benchmarking-victoriametrics-vs-timescale-and-influxdb-6035811952d4)
may be crammed into limited storage compared to TimescaleDB.
* Optimized for storage with high-latency IO and low iops (HDD and network storage in AWS, Google Cloud, Microsoft Azure, etc). See [graphs from these benchmarks](https://medium.com/@valyala/high-cardinality-tsdb-benchmarks-victoriametrics-vs-timescaledb-vs-influxdb-13e6ee64dd6b).
* A single-node VictoriaMetrics may substitute moderately sized clusters built with competing solutions such as Thanos, Uber M3, Cortex, InfluxDB or TimescaleDB.
See [vertical scalability benchmarks](https://medium.com/@valyala/measuring-vertical-scalability-for-time-series-databases-in-google-cloud-92550d78d8ae).
* Easy operation:
* VictoriaMetrics consists of a single executable without external dependencies.
* All the configuration is done via explicit command-line flags with reasonable defaults.
* All the data is stored in a single directory pointed by `-storageDataPath` flag.
* Easy backups from [instant snapshots](https://medium.com/@valyala/how-victoriametrics-makes-instant-snapshots-for-multi-terabyte-time-series-data-e1f3fb0e0282).
* Storage is protected from corruption on unclean shutdown (i.e. hardware reset or `kill -9`) thanks to [the storage architecture](https://medium.com/@valyala/how-victoriametrics-makes-instant-snapshots-for-multi-terabyte-time-series-data-e1f3fb0e0282).
* Supports metrics' ingestion and backfilling via the following protocols:
* [Prometheus remote write API](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write)
* [InfluxDB line protocol](https://docs.influxdata.com/influxdb/v1.7/write_protocols/line_protocol_tutorial/)
* [Graphite plaintext protocol](https://graphite.readthedocs.io/en/latest/feeding-carbon.html) with [tags](https://graphite.readthedocs.io/en/latest/tags.html#carbon)
if `-graphiteListenAddr` is set.
* [OpenTSDB put message](http://opentsdb.net/docs/build/html/api_telnet/put.html) if `-opentsdbListenAddr` is set.
* Ideally works with big amounts of time series data from IoT sensors, connected car sensors and industrial sensors.
* Has open source [cluster version](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/cluster).
- Supports all the features of [single-node version](https://github.com/VictoriaMetrics/VictoriaMetrics).
- Performance and capacity scales horizontally.
- Supports multiple independent namespaces for time series data (aka multi-tenancy).
## Operation
## Architecture overview
VictoriaMetrics cluster consists of the following services:
- `vmstorage` - stores the data
- `vminsert` - proxies the ingested data to `vmstorage` shards using consistent hashing
- `vmselect` - performs incoming queries using the data from `vmstorage`
Each service may scale independently and may run on the most suitable hardware.
`vmstorage` nodes don't know about each other, don't communicate with each other and don't share any data.
This is [shared nothing architecture](https://en.wikipedia.org/wiki/Shared-nothing_architecture).
It increases cluster availability, simplifies cluster maintenance and cluster scaling.
<img src="https://docs.google.com/drawings/d/e/2PACX-1vTvk2raU9kFgZ84oF-OKolrGwHaePhHRsZEcfQ1I_EC5AB_XPWwB392XshxPramLJ8E4bqptTnFn5LL/pub?w=1104&amp;h=746">
### Table of contents
## Binaries
* [How to build from sources](#how-to-build-from-sources)
* [How to start VictoriaMetrics](#how-to-start-victoriametrics)
* [Prometheus setup](#prometheus-setup)
* [Grafana setup](#grafana-setup)
* [How to send data from InfluxDB-compatible agents such as Telegraf](#how-to-send-data-from-influxdb-compatible-agents-such-as-telegraf)
* [How to send data from Graphite-compatible agents such as StatsD](#how-to-send-data-from-graphite-compatible-agents-such-as-statsd)
* [How to send data from OpenTSDB-compatible agents](#how-to-send-data-from-opentsdb-compatible-agents)
* [How to apply new config / upgrade VictoriaMetrics](#how-to-apply-new-config--upgrade-victoriametrics)
* [How to work with snapshots](#how-to-work-with-snapshots)
* [How to delete time series](#how-to-delete-time-series)
* [How to export time series](#how-to-export-time-series)
* [Federation](#federation)
* [Capacity planning](#capacity-planning)
* [High Availability](#high-availability)
* [Multiple retentions](#multiple-retentions)
* [Scalability and cluster version](#scalability-and-cluster-version)
* [Security](#security)
* [Tuning](#tuning)
* [Monitoring](#monitoring)
* [Troubleshooting](#troubleshooting)
* [Community and contributions](#community-and-contributions)
* [Reporting bugs](#reporting-bugs)
Compiled binaries for cluster version are available in the `assets` section of [releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases).
See the archives containing the `cluster` word.
Docker images for cluster version are available here:
- `vminsert` - https://hub.docker.com/r/victoriametrics/vminsert/tags
- `vmselect` - https://hub.docker.com/r/victoriametrics/vmselect/tags
- `vmstorage` - https://hub.docker.com/r/victoriametrics/vmstorage/tags
### How to build from sources
## Building from sources
We recommend using either [binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) or
[docker images](https://hub.docker.com/r/valyala/victoria-metrics/) instead of building VictoriaMetrics
from sources. Building from sources is reasonable when developing additional features specific
to your needs.
Source code for cluster version is available at [cluster branch](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/cluster).
#### Development build
### Development Builds
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.12.
2. Run `go build ./app/victoria-metrics` from the root folder of the repository.
It will build `victoria-metrics` binary in the root folder of the repository.
1. [Install go](https://golang.org/doc/install). The minimum supported version is Go 1.12.
2. Run `make` from the repository root. It should build `vmstorage`, `vmselect`
and `vminsert` binaries and put them into the `bin` folder.
#### Production build
1. [Install docker](https://docs.docker.com/install/).
2. Run `make victoria-metrics-prod` from the root folder of the repository.
It will build `victoria-metrics-prod` binary and put it into the `bin` folder.
### Production builds
#### Building docker images
There is no need to install Go on the host system since binaries are built
inside [the official docker container for Go](https://hub.docker.com/_/golang).
This makes the builds reproducible.
So [install docker](https://docs.docker.com/install/) and run the following command:
```
make vminsert-prod vmselect-prod vmstorage-prod
```
Production binaries are built into statically linked binaries for `GOARCH=amd64`, `GOOS=linux`.
They are put into `bin` folder with `-prod` suffixes:
```
$ make vminsert-prod vmselect-prod vmstorage-prod
$ ls -1 bin
vminsert-prod
vmselect-prod
vmstorage-prod
```
### Building docker images
Run `make package`. It will build the following docker images locally:
* `victoriametrics/vminsert:<PKG_TAG>`
* `victoriametrics/vmselect:<PKG_TAG>`
* `victoriametrics/vmstorage:<PKG_TAG>`
Run `make package-victoria-metrics`. It will build `valyala/victoria-metrics:<PKG_TAG>` docker image locally.
`<PKG_TAG>` is an auto-generated image tag, which depends on the source code in the repository.
The `<PKG_TAG>` may be manually set via `PKG_TAG=foobar make package`.
### How to start VictoriaMetrics
## Operation
Just start VictoriaMetrics executable or docker image with the desired command-line flags.
### Cluster setup
The following command line flags are used the most:
A minimal cluster must contain the following nodes:
* `-storageDataPath` - path to data directory. VictoriaMetrics stores all the data in this directory.
* `-retentionPeriod` - retention period in months for the data. Older data is automatically deleted.
* `-httpListenAddr` - TCP address to listen to for http requests. By default it listens port `8428` on all the network interfaces.
* `-graphiteListenAddr` - TCP and UDP address to listen to for Graphite data. By default it is disabled.
* `-opentsdbListenAddr` - TCP and UDP address to listen to for OpenTSDB data. By default it is disabled.
* a single `vmstorage` node with `-retentionPeriod` and `-storageDataPath` flags
* a single `vminsert` node with `-storageNode=<vmstorage_host>:8400`
* a single `vmselect` node with `-storageNode=<vmstorage_host>:8401`
Pass `-help` to see all the available flags with description and default values.
It is recommended to run at least two nodes for each service
for high availability purposes.
An http load balancer must be put in front of `vminsert` and `vmselect` nodes:
- requests starting with `/insert` must be routed to port `8480` on `vminsert` nodes.
- requests starting with `/select` must be routed to port `8481` on `vmselect` nodes.
Ports may be altered by setting `-httpListenAddr` on the corresponding nodes.
It is recommended setting up [monitoring](#monitoring) for the cluster.
#### Environment variables
Each flag value can be set through environment variables by following these rules:
- The `-envflag.enable` flag must be set
- Each `.` in flag names must be substituted by `_` (for example `-insert.maxQueueDuration <duration>` will translate to `insert_maxQueueDuration=<duration>`)
- For repeating flags, an alternative syntax can be used by joining the different values into one using `,` as separator (for example `-storageNode <nodeA> -storageNode <nodeB>` will translate to `storageNode=<nodeA>,<nodeB>`)
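As an illustration of the rules above, here is the same hypothetical `vminsert` configuration expressed via command-line flags and via environment variables; the node addresses and the flag values are placeholders:

```
# Plain command-line flags:
/path/to/vminsert-prod -storageNode=vmstorage-1:8400 -storageNode=vmstorage-2:8400 -insert.maxQueueDuration=30s

# The same configuration via environment variables (note the -envflag.enable flag):
storageNode=vmstorage-1:8400,vmstorage-2:8400 \
insert_maxQueueDuration=30s \
/path/to/vminsert-prod -envflag.enable
```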
### Monitoring
All the cluster components expose various metrics in Prometheus-compatible format at `/metrics` page on the TCP port set in `-httpListenAddr` command-line flag.
By default the following TCP ports are used:
- `vminsert` - 8480
- `vmselect` - 8481
- `vmstorage` - 8482
It is recommended setting up Prometheus to scrape `/metrics` pages from all the cluster components, so they can be monitored and analyzed
with [the official Grafana dashboard for VictoriaMetrics cluster](https://grafana.com/grafana/dashboards/11176).
### Prometheus setup
### URL format
Add the following lines to Prometheus config file (it is usually located at `/etc/prometheus/prometheus.yml`):
* URLs for data ingestion: `http://<vminsert>:8480/insert/<accountID>/<suffix>`, where:
- `<accountID>` is an arbitrary number identifying namespace for data ingestion (aka tenant)
- `<suffix>` may have the following values:
- `prometheus` - for inserting data with [Prometheus remote write API](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write)
- `influx/write` or `influx/api/v2/write` - for inserting data with [Influx line protocol](https://docs.influxdata.com/influxdb/v1.7/write_protocols/line_protocol_tutorial/)
- `opentsdb/api/put` - for accepting [OpenTSDB HTTP /api/put requests](http://opentsdb.net/docs/build/html/api_http/put.html).
- `prometheus/api/v1/import` - for importing data obtained via `api/v1/export` on `vmselect` (see below).
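For example, a single data point in Influx line protocol may be pushed to the hypothetical tenant `42` like this (the `<vminsert>` address, the measurement and the tag are placeholders):

```
curl -d 'cpu_usage,host=web-1 value=0.42' 'http://<vminsert>:8480/insert/42/influx/write'
```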
```yml
remote_write:
- url: http://<victoriametrics-addr>:8428/api/v1/write
queue_config:
max_samples_per_send: 10000
```
* URLs for querying: `http://<vmselect>:8481/select/<accountID>/prometheus/<suffix>`, where:
- `<accountID>` is an arbitrary number identifying data namespace for the query (aka tenant)
- `<suffix>` may have the following values:
- `api/v1/query` - performs [PromQL instant query](https://prometheus.io/docs/prometheus/latest/querying/api/#instant-queries)
- `api/v1/query_range` - performs [PromQL range query](https://prometheus.io/docs/prometheus/latest/querying/api/#range-queries)
- `api/v1/series` - performs [series query](https://prometheus.io/docs/prometheus/latest/querying/api/#finding-series-by-label-matchers)
- `api/v1/labels` - returns a [list of label names](https://prometheus.io/docs/prometheus/latest/querying/api/#getting-label-names)
- `api/v1/label/<label_name>/values` - returns values for the given `<label_name>` according [to API](https://prometheus.io/docs/prometheus/latest/querying/api/#querying-label-values)
- `federate` - returns [federated metrics](https://prometheus.io/docs/prometheus/latest/federation/)
- `api/v1/export` - exports raw data. See [this article](https://medium.com/@valyala/analyzing-prometheus-data-with-external-tools-5f3e5e147639) for details
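A sketch of instant and range queries against the hypothetical tenant `42`; the `<vmselect>` address and the queries themselves are placeholders:

```
# Instant query:
curl 'http://<vmselect>:8481/select/42/prometheus/api/v1/query' --data-urlencode 'query=up'

# Range query over the last hour with a 15s step:
curl 'http://<vmselect>:8481/select/42/prometheus/api/v1/query_range' \
  --data-urlencode 'query=sum(rate(cpu_usage[5m]))' \
  --data-urlencode "start=$(( $(date +%s) - 3600 ))" \
  --data-urlencode "end=$(date +%s)" \
  --data-urlencode 'step=15s'
```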
Substitute `<victoriametrics-addr>` with the hostname or IP address of VictoriaMetrics.
Then apply the new config via the following command:
* URL for time series deletion: `http://<vmselect>:8481/delete/<accountID>/prometheus/api/v1/admin/tsdb/delete_series?match[]=<timeseries_selector_for_delete>`.
Note that the `delete_series` handler should be used only in exceptional cases such as deletion of accidentally ingested incorrect time series. It shouldn't
be used on a regular basis, since it carries non-zero overhead.
```
kill -HUP `pidof prometheus`
```
* `vmstorage` nodes provide the following HTTP endpoints on `8482` port:
- `/snapshot/create` - create [instant snapshot](https://medium.com/@valyala/how-victoriametrics-makes-instant-snapshots-for-multi-terabyte-time-series-data-e1f3fb0e0282),
which can be used for backups in background. Snapshots are created in `<storageDataPath>/snapshots` folder, where `<storageDataPath>` is the corresponding
command-line flag value.
- `/snapshot/list` - list available snapshots.
- `/snapshot/delete?snapshot=<id>` - delete the given snapshot.
- `/snapshot/delete_all` - delete all the snapshots.
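For example, a snapshot may be created and then listed on a `vmstorage` node like this (the `<vmstorage>` address is a placeholder):

```
curl 'http://<vmstorage>:8482/snapshot/create'
curl 'http://<vmstorage>:8482/snapshot/list'
```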
Prometheus writes incoming data to local storage and to remote storage in parallel.
This means the data remains available in local storage for `--storage.tsdb.retention.time` duration
if remote storage stops working.
If you plan sending data to VictoriaMetrics from multiple Prometheus instances, then add the following lines into `global` section
of [Prometheus config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#configuration-file):
```yml
global:
external_labels:
datacenter: dc-123
```
This instructs Prometheus to add `datacenter=dc-123` label to each time series sent to remote storage.
The label name may be arbitrary - `datacenter` is just an example. The label value must be unique
across Prometheus instances, so time series may be filtered and grouped by this label.
Snapshots may be created independently on each `vmstorage` node. There is no need to synchronize snapshot creation
across `vmstorage` nodes.
### Grafana setup
### Cluster resizing and scalability
Create [Prometheus datasource](http://docs.grafana.org/features/datasources/prometheus/) in Grafana with the following Url:
Cluster performance and capacity scales with adding new nodes.
```
http://<victoriametrics-addr>:8428
```
* `vminsert` and `vmselect` nodes are stateless and may be added / removed at any time.
Do not forget updating the list of these nodes on http load balancer.
Adding more `vminsert` nodes scales data ingestion rate. See [this comment](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/175#issuecomment-536925841)
about ingestion rate scalability.
Adding more `vmselect` nodes scales select queries rate.
* `vmstorage` nodes own the ingested data, so they cannot be removed without data loss.
Adding more `vmstorage` nodes scales cluster capacity.
Substitute `<victoriametrics-addr>` with the hostname or IP address of VictoriaMetrics.
Steps to add `vmstorage` node:
Then build graphs with the created datasource using [Prometheus query language](https://prometheus.io/docs/prometheus/latest/querying/basics/).
VictoriaMetrics supports native PromQL and [extends it with useful features](ExtendedPromQL).
1. Start new `vmstorage` node with the same `-retentionPeriod` as existing nodes in the cluster.
2. Gradually restart all the `vmselect` nodes with new `-storageNode` arg containing `<new_vmstorage_host>:8401`.
3. Gradually restart all the `vminsert` nodes with new `-storageNode` arg containing `<new_vmstorage_host>:8400`.
### How to send data from InfluxDB-compatible agents such as [Telegraf](https://www.influxdata.com/time-series-platform/telegraf/)?
Just use `http://<victoriametrics-addr>:8428` url instead of InfluxDB url in agents' configs.
For instance, put the following lines into `Telegraf` config, so it sends data to VictoriaMetrics instead of InfluxDB:
```
[[outputs.influxdb]]
  urls = ["http://<victoriametrics-addr>:8428"]
```
Do not forget substituting `<victoriametrics-addr>` with the real address where VictoriaMetrics runs.
VictoriaMetrics maps Influx data using the following rules:
* [`db` query arg](https://docs.influxdata.com/influxdb/v1.7/tools/api/#write-http-endpoint) is mapped into `db` label value
* Field names are mapped to time series names prefixed by `{measurement}.` value
* Field values are mapped to time series values
* Tags are mapped to Prometheus labels as-is
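As a hypothetical illustration of these rules, an Influx line such as `foo,tag1=value1,tag2=value2 field1=12` would be stored as the time series `foo.field1{tag1="value1", tag2="value2"}` with the value `12`.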
### Cluster availability
* HTTP load balancer must stop routing requests to unavailable `vminsert` and `vmselect` nodes.
* The cluster remains available if at least a single `vmstorage` node exists:
  - `vminsert` re-routes incoming data from unavailable `vmstorage` nodes to healthy `vmstorage` nodes
  - `vmselect` continues serving partial responses if at least a single `vmstorage` node is available.
### How to send data from Graphite-compatible agents such as [StatsD](https://github.com/etsy/statsd)?
1) Enable Graphite receiver in VictoriaMetrics by setting the `-graphiteListenAddr` command line flag. For instance,
the following command will enable Graphite receiver in VictoriaMetrics on TCP and UDP port `2003`:
```
/path/to/victoria-metrics-prod ... -graphiteListenAddr=:2003
```
2) Use the configured address in Graphite-compatible agents. For instance, set `graphiteHost`
to the VictoriaMetrics host in `StatsD` configs.
### Updating / reconfiguring cluster nodes
All the node types - `vminsert`, `vmselect` and `vmstorage` - may be updated via graceful shutdown.
Send `SIGINT` signal to the corresponding process, wait until it finishes and then start the new version
with new configs.
The cluster should remain in working state if at least a single node of each type remains available during
the update process. See the [cluster availability](#cluster-availability) section for details.
### How to send data from OpenTSDB-compatible agents?
1) Enable OpenTSDB receiver in VictoriaMetrics by setting `-opentsdbListenAddr` command line flag. For instance,
the following command will enable OpenTSDB receiver in VictoriaMetrics on TCP and UDP port `4242`:
```
/path/to/victoria-metrics-prod ... -opentsdbListenAddr=:4242
```
2) Send data to the given address from OpenTSDB-compatible agents.
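For illustration, a hypothetical telnet-style line such as `put sys.cpu.user 1549891472 42.5 host=web01` sent to the configured port would be stored as the `sys.cpu.user` time series with the value `42.5` and the `host="web01"` label.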
### How to apply new config / upgrade VictoriaMetrics?
VictoriaMetrics must be restarted in order to upgrade or apply new config:
1) Send `SIGINT` signal to VictoriaMetrics process in order to gracefully stop it.
2) Wait until the process stops. This can take a few seconds.
3) Start the upgraded VictoriaMetrics with new config.
### How to work with snapshots?
Navigate to `http://<victoriametrics-addr>:8428/snapshot/create` in order to create an instant snapshot.
The page will return the following JSON response:
```
{"status":"ok","snapshot":"<snapshot-name>"}
```
Snapshots are created under `<-storageDataPath>/snapshots` directory, where `<-storageDataPath>`
is the command-line flag value. Snapshots can be archived to backup storage via `rsync -L`, `scp -r`
or any similar tool that follows symlinks during copying.
The `http://<victoriametrics-addr>:8428/snapshot/list` page contains the list of available snapshots.
Navigate to `http://<victoriametrics-addr>:8428/snapshot/delete?snapshot=<snapshot-name>` in order
to delete `<snapshot-name>` snapshot.
Navigate to `http://<victoriametrics-addr>:8428/snapshot/delete_all` in order to delete all the snapshots.
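The snapshot API can also be driven programmatically. The following Go sketch, assuming a single-node instance at `localhost:8428`, creates a snapshot and prints its name from the JSON response shown above:
```go
package main

import (
	"encoding/json"
	"fmt"
	"net/http"
)

func main() {
	// Assumes VictoriaMetrics listens on localhost:8428.
	resp, err := http.Get("http://localhost:8428/snapshot/create")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// The handler returns {"status":"ok","snapshot":"<snapshot-name>"}.
	var r struct {
		Status   string `json:"status"`
		Snapshot string `json:"snapshot"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&r); err != nil {
		panic(err)
	}
	fmt.Printf("status=%s snapshot=%s\n", r.Status, r.Snapshot)
}
```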
### How to delete time series?
Send a request to `http://<victoriametrics-addr>:8428/api/v1/admin/tsdb/delete_series?match[]=<timeseries_selector_for_delete>`,
where `<timeseries_selector_for_delete>` may contain any [time series selector](https://prometheus.io/docs/prometheus/latest/querying/basics/#time-series-selectors)
for metrics to delete. After that all the time series matching the given selector are deleted. Storage space for
the deleted time series isn't freed instantly - it is freed during subsequent merges of data files.
### How to export time series?
Send a request to `http://<victoriametrics-addr>:8428/api/v1/export?match[]=<timeseries_selector_for_export>`,
where `<timeseries_selector_for_export>` may contain any [time series selector](https://prometheus.io/docs/prometheus/latest/querying/basics/#time-series-selectors)
for metrics to export. The response would contain all the data for the selected time series in [JSON streaming format](https://en.wikipedia.org/wiki/JSON_streaming#Line-delimited_JSON).
Each JSON line would contain data for a single time series. An example output:
```
{"metric":{"__name__":"up","job":"node_exporter","instance":"localhost:9100"},"values":[0,0,0],"timestamps":[1549891472010,1549891487724,1549891503438]}
{"metric":{"__name__":"up","job":"prometheus","instance":"localhost:9090"},"values":[1,1,1],"timestamps":[1549891461511,1549891476511,1549891491511]}
```
Optional `start` and `end` args may be added to the request in order to limit the time frame for the exported data. These args may contain either
unix timestamp in seconds or [RFC3339](https://www.ietf.org/rfc/rfc3339.txt) values.
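Since the export format is line-delimited JSON, it can be consumed with a streaming decoder. A minimal Go sketch, assuming a single-node instance at `localhost:8428` and the selector `up`, might look like this:
```go
package main

import (
	"bufio"
	"encoding/json"
	"fmt"
	"net/http"
	"net/url"
)

// exportedSeries matches a single line of /api/v1/export output.
type exportedSeries struct {
	Metric     map[string]string `json:"metric"`
	Values     []float64         `json:"values"`
	Timestamps []int64           `json:"timestamps"`
}

func main() {
	params := url.Values{}
	params.Set("match[]", "up")
	resp, err := http.Get("http://localhost:8428/api/v1/export?" + params.Encode())
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// Each line contains all the data for a single time series.
	scanner := bufio.NewScanner(resp.Body)
	scanner.Buffer(make([]byte, 0, 64*1024), 16*1024*1024) // allow long lines
	for scanner.Scan() {
		var s exportedSeries
		if err := json.Unmarshal(scanner.Bytes(), &s); err != nil {
			panic(err)
		}
		fmt.Printf("%v: %d samples\n", s.Metric, len(s.Values))
	}
	if err := scanner.Err(); err != nil {
		panic(err)
	}
}
```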
### Federation
VictoriaMetrics exports [Prometheus-compatible federation data](https://prometheus.io/docs/prometheus/latest/federation/)
at `http://<victoriametrics-addr>:8428/federate?match[]=<timeseries_selector_for_federation>`.
Optional `start` and `end` args may be added to the request in order to scrape the last point for each selected time series on the `[start ... end]` interval.
`start` and `end` may contain either unix timestamp in seconds or [RFC3339](https://www.ietf.org/rfc/rfc3339.txt) values. By default the last point
on the interval `[now - max_lookback ... now]` is scraped for each time series. The default value for `max_lookback` is `5m` (5 minutes), but it can be overridden.
For instance, `/federate?match[]=up&max_lookback=1h` would return last points on the `[now - 1h ... now]` interval. This may be useful for time series federation
with scrape intervals exceeding `5m`.
### Capacity planning
Rough estimation of the required resources:
* RAM size: less than 1KB per active time series. So, ~1GB of RAM is required for 1M active time series.
Time series is considered active if new data points have been added to it recently or if it has been recently queried.
VictoriaMetrics stores various caches in RAM. Memory size for these caches may be limited with `-memory.allowedPercent` flag.
* CPU cores: a CPU core per 300K inserted data points per second. So, ~4 CPU cores are required for processing
the insert stream of 1M data points per second.
If you see lower numbers per CPU core, then it is likely active time series info doesn't fit caches,
so you need more RAM for lowering CPU usage.
* Storage size: less than a byte per data point on average. So, ~260GB is required for storing a month-long insert stream
of 100K data points per second.
The actual storage size heavily depends on data randomness (entropy). Higher randomness means higher storage size requirements.
Each instance type - `vminsert`, `vmselect` and `vmstorage` - can run on the most suitable hardware.
#### vminsert
* The recommended total number of vCPU cores for all the `vminsert` instances can be calculated from the ingestion rate: `vCPUs = ingestion_rate / 150K`.
* The recommended number of vCPU cores per each `vminsert` instance should be equal to the number of `vmstorage` instances in the cluster.
* The amount of RAM per each `vminsert` instance should be 1GB or more. RAM is used as a buffer for spikes in ingestion rate.
* Sometimes `-rpc.disableCompression` command-line flag on `vminsert` instances could increase ingestion capacity at the cost
of higher network bandwidth usage between `vminsert` and `vmstorage`.
#### vmstorage
* The recommended total number of vCPU cores for all the `vmstorage` instances can be calculated from the ingestion rate: `vCPUs = ingestion_rate / 150K`.
* The recommended total amount of RAM for all the `vmstorage` instances can be calculated from the number of active time series: `RAM = active_time_series * 1KB`.
Time series is active if it received at least a single data point during the last hour or if it has been queried during the last hour.
* The recommended total amount of storage space for all the `vmstorage` instances can be calculated
from the ingestion rate and retention: `storage_space = ingestion_rate * retention_seconds`.
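For example, an ingestion rate of 500K data points per second with a one-month retention (roughly 2.6 million seconds) works out to `500000 * 2600000 ≈ 1.3e12`, i.e. about 1.3TB of total `vmstorage` disk space, treating the result as bytes per the average of roughly a byte per data point mentioned above.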
#### vmselect
The recommended hardware for `vmselect` instances highly depends on the type of queries. Lightweight queries over a small number of time series usually require
a small number of vCPU cores and a small amount of RAM on `vmselect`, while heavy queries over a big number of time series (>10K) usually require
a bigger number of vCPU cores and bigger amounts of RAM.
### High availability
1) Install multiple VictoriaMetrics instances in distinct datacenters.
2) Add addresses of these instances to the `remote_write` section in Prometheus config:
```yml
remote_write:
  - url: http://<victoriametrics-addr-1>:8428/api/v1/write
    queue_config:
      max_samples_per_send: 10000
  # ...
  - url: http://<victoriametrics-addr-N>:8428/api/v1/write
    queue_config:
      max_samples_per_send: 10000
```
3) Apply the updated config:
```
kill -HUP `pidof prometheus`
```
4) Now Prometheus should write data into all the configured `remote_write` urls in parallel.
5) Set up [Promxy](https://github.com/jacksontj/promxy) in front of all the VictoriaMetrics replicas.
6) Set up Prometheus datasource in Grafana that points to Promxy.
### Helm
Helm chart simplifies managing the cluster version of VictoriaMetrics in Kubernetes.
It is available in the [helm-charts](https://github.com/VictoriaMetrics/helm-charts) repository.
Upgrade follows the `Cluster resizing procedure` under the hood.
### Multiple retentions
Just start multiple VictoriaMetrics instances with distinct values for the following flags:
* `-retentionPeriod`
* `-storageDataPath`, so the data for each retention period is saved in a separate directory
* `-httpListenAddr`, so clients may reach the VictoriaMetrics instance with the proper retention
### Replication and data safety
VictoriaMetrics offloads replication to the underlying storage pointed by `-storageDataPath`.
It is recommended storing data on [Google Compute Engine persistent disks](https://cloud.google.com/compute/docs/disks/#pdspecs),
since they are protected from data loss and data corruption. They also provide consistently high performance
and [may be resized](https://cloud.google.com/compute/docs/disks/add-persistent-disk) without downtime.
HDD-based persistent disks should be enough for the majority of use cases.
It is recommended using durable replicated persistent volumes in Kubernetes.
Note that [replication doesn't save from disaster](https://medium.com/@valyala/speeding-up-backups-for-big-time-series-databases-533c1a927883).
### Scalability and cluster version
Though single-node VictoriaMetrics cannot scale to multiple nodes, it is optimized for resource usage - storage size / bandwidth / IOPS, RAM, CPU.
This means that a single-node VictoriaMetrics may scale vertically and substitute a moderately sized cluster built with competing solutions
such as Thanos, Uber M3, InfluxDB or TimescaleDB.
So try single-node VictoriaMetrics at first and then [switch to cluster version](https://github.com/VictoriaMetrics/VictoriaMetrics/tree/cluster) if you still need
horizontally scalable long-term remote storage for really large Prometheus deployments.
[Contact us](mailto:info@victoriametrics.com) for paid support.
### Backups
It is recommended performing periodical backups from [instant snapshots](https://medium.com/@valyala/how-victoriametrics-makes-instant-snapshots-for-multi-terabyte-time-series-data-e1f3fb0e0282)
for protecting from user errors such as accidental data deletion.
The following steps must be performed for each `vmstorage` node for creating a backup:
1. Create an instant snapshot by navigating to `/snapshot/create` HTTP handler. It will create a snapshot and return its name.
2. Archive the created snapshot from `<-storageDataPath>/snapshots/<snapshot_name>` folder using [vmbackup](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/app/vmbackup/README.md).
The archival process doesn't interfere with `vmstorage` work, so it may be performed at any suitable time.
3. Delete unused snapshots via `/snapshot/delete?snapshot=<snapshot_name>` or `/snapshot/delete_all` in order to free up occupied storage space.
There is no need in synchronizing backups among all the `vmstorage` nodes.
Restoring from backup:
1. Stop `vmstorage` node with `kill -INT`.
2. Restore data from backup using [vmrestore](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/cluster/app/vmrestore/README.md) into `-storageDataPath` directory.
3. Start `vmstorage` node.
### Security
Do not forget protecting sensitive endpoints in VictoriaMetrics when exposing it to untrusted networks such as the internet.
Consider setting the following command-line flags:
* `-tls`, `-tlsCertFile` and `-tlsKeyFile` for switching from HTTP to HTTPS.
* `-httpAuth.username` and `-httpAuth.password` for protecting all the HTTP endpoints
with [HTTP Basic Authentication](https://en.wikipedia.org/wiki/Basic_access_authentication).
* `-deleteAuthKey` for protecting `/api/v1/admin/tsdb/delete_series` endpoint. See [how to delete time series](#how-to-delete-time-series).
* `-snapshotAuthKey` for protecting `/snapshot*` endpoints. See [how to work with snapshots](#how-to-work-with-snapshots).
Explicitly set internal network interface for TCP and UDP ports for data ingestion with Graphite and OpenTSDB formats.
For example, substitute `-graphiteListenAddr=:2003` with `-graphiteListenAddr=<internal_iface_ip>:2003`.
### Tuning
* There is no need in VictoriaMetrics tuning, since it uses reasonable defaults for command-line flags,
which are automatically adjusted for the available CPU and RAM resources.
* There is no need in Operating System tuning, since VictoriaMetrics is optimized for default OS settings.
The only option is increasing the limit on [the number of open files in the OS](https://medium.com/@muhammadtriwibowo/set-permanently-ulimit-n-open-files-in-ubuntu-4d61064429a),
so Prometheus instances could establish more connections to VictoriaMetrics.
### Monitoring
VictoriaMetrics exports internal metrics in Prometheus format on the `/metrics` page.
Add this page to Prometheus' scrape config in order to collect VictoriaMetrics metrics.
There is [an official Grafana dashboard for single-node VictoriaMetrics](https://grafana.com/dashboards/10229).
### Troubleshooting
* If VictoriaMetrics works slowly and eats more than a CPU core per 100K ingested data points per second,
then it is likely you have too many active time series for the current amount of RAM.
It is recommended increasing the amount of RAM on the node with VictoriaMetrics in order to improve
ingestion performance.
Another option is to increase `-memory.allowedPercent` command-line flag value. Be careful with this
option, since too big value for `-memory.allowedPercent` may result in high I/O usage.
## Community and contributions
Feel free asking any questions regarding VictoriaMetrics [here](https://groups.google.com/forum/#!forum/victorametrics-users).
We are open to third-party pull requests provided they follow [KISS design principle](https://en.wikipedia.org/wiki/KISS_principle):
- Prefer simple code and architecture.
Adhering to the `KISS` principle simplifies the resulting code and architecture, so it can be reviewed, understood and verified by many people.
Due to `KISS`, the cluster version of VictoriaMetrics doesn't have the following "features" popular in the distributed computing world:
- Fragile gossip protocols. See [failed attempt in Thanos](https://github.com/improbable-eng/thanos/blob/030bc345c12c446962225221795f4973848caab5/docs/proposals/completed/201809_gossip-removal.md).
- Hard-to-understand-and-implement-properly [Paxos protocols](https://www.quora.com/In-distributed-systems-what-is-a-simple-explanation-of-the-Paxos-algorithm).
- Complex replication schemes, which may go nuts in unforeseen edge cases. The replication is offloaded to the underlying durable replicated storage
such as [persistent disks in Google Compute Engine](https://cloud.google.com/compute/docs/disks/#pdspecs).
- Automatic data reshuffling between storage nodes, which may hurt cluster performance and availability.
- Automatic cluster resizing, which may cost you a lot of money if improperly configured.
- Automatic discovering and addition of new nodes in the cluster, which may mix data between dev and prod clusters :)
- Automatic leader election, which may result in split brain disaster on network errors.
## Reporting bugs


@@ -1,21 +0,0 @@
# All these commands must run from repository root.
victoria-metrics-prod:
APP_NAME=victoria-metrics $(MAKE) app-via-docker
package-victoria-metrics:
APP_NAME=victoria-metrics \
$(MAKE) package-via-docker
publish-victoria-metrics:
APP_NAME=victoria-metrics $(MAKE) publish-via-docker
run-victoria-metrics:
mkdir -p victoria-metrics-data
DOCKER_OPTS='-v $(shell pwd)/victoria-metrics-data:/victoria-metrics-data -p 8428:8428 -p 2003:2003 -p 2003:2003/udp' \
APP_NAME=victoria-metrics \
ARGS='-graphiteListenAddr=:2003 -opentsdbListenAddr=:4242 -retentionPeriod=12 -search.maxUniqueTimeseries=1000000 -search.maxQueryDuration=10m' \
$(MAKE) run-via-docker
victoria-metrics-arm:
CC=arm-linux-gnueabi-gcc CGO_ENABLED=1 GOARCH=arm GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/victoria-metrics-arm ./app/victoria-metrics


@@ -1,5 +0,0 @@
FROM scratch
COPY --from=local/certs:1.0.2 /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-certificates.crt
COPY bin/victoria-metrics-prod .
EXPOSE 8428
ENTRYPOINT ["/victoria-metrics-prod"]


@@ -1,60 +0,0 @@
package main
import (
"flag"
"net/http"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmstorage"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
)
var httpListenAddr = flag.String("httpListenAddr", ":8428", "TCP address to listen for http connections")
func main() {
flag.Parse()
buildinfo.Init()
logger.Init()
logger.Infof("starting VictoriaMetrics at %q...", *httpListenAddr)
startTime := time.Now()
vmstorage.Init()
vmselect.Init()
vminsert.Init()
go httpserver.Serve(*httpListenAddr, requestHandler)
logger.Infof("started VictoriaMetrics in %s", time.Since(startTime))
sig := procutil.WaitForSigterm()
logger.Infof("received signal %s", sig)
logger.Infof("gracefully shutting down webservice at %q", *httpListenAddr)
startTime = time.Now()
if err := httpserver.Stop(*httpListenAddr); err != nil {
logger.Fatalf("cannot stop the webservice: %s", err)
}
vminsert.Stop()
logger.Infof("successfully shut down the webservice in %s", time.Since(startTime))
vmstorage.Stop()
vmselect.Stop()
logger.Infof("the VictoriaMetrics has been stopped in %s", time.Since(startTime))
}
func requestHandler(w http.ResponseWriter, r *http.Request) bool {
if vminsert.RequestHandler(w, r) {
return true
}
if vmselect.RequestHandler(w, r) {
return true
}
if vmstorage.RequestHandler(w, r) {
return true
}
return false
}

app/vmagent/Makefile

@@ -0,0 +1,73 @@
# All these commands must run from repository root.
vmagent:
APP_NAME=vmagent $(MAKE) app-local
vmagent-prod:
APP_NAME=vmagent $(MAKE) app-via-docker
vmagent-pure-prod:
APP_NAME=vmagent $(MAKE) app-via-docker-pure
vmagent-amd64-prod:
APP_NAME=vmagent $(MAKE) app-via-docker-amd64
vmagent-arm-prod:
APP_NAME=vmagent $(MAKE) app-via-docker-arm
vmagent-arm64-prod:
APP_NAME=vmagent $(MAKE) app-via-docker-arm64
vmagent-ppc64le-prod:
APP_NAME=vmagent $(MAKE) app-via-docker-ppc64le
vmagent-386-prod:
APP_NAME=vmagent $(MAKE) app-via-docker-386
package-vmagent:
APP_NAME=vmagent $(MAKE) package-via-docker
package-vmagent-pure:
APP_NAME=vmagent $(MAKE) package-via-docker-pure
package-vmagent-amd64:
APP_NAME=vmagent $(MAKE) package-via-docker-amd64
package-vmagent-arm:
APP_NAME=vmagent $(MAKE) package-via-docker-arm
package-vmagent-arm64:
APP_NAME=vmagent $(MAKE) package-via-docker-arm64
package-vmagent-ppc64le:
APP_NAME=vmagent $(MAKE) package-via-docker-ppc64le
package-vmagent-386:
APP_NAME=vmagent $(MAKE) package-via-docker-386
publish-vmagent:
APP_NAME=vmagent $(MAKE) publish-via-docker
run-vmagent:
mkdir -p vmagent-data
DOCKER_OPTS='-v $(shell pwd)/vmagent-data:/vmagent-data' \
APP_NAME=vmagent \
$(MAKE) run-via-docker
vmagent-amd64:
CGO_ENABLED=1 GOOS=linux GOARCH=amd64 GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/vmagent-amd64 ./app/vmagent
vmagent-arm:
CGO_ENABLED=0 GOOS=linux GOARCH=arm GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/vmagent-arm ./app/vmagent
vmagent-arm64:
CGO_ENABLED=0 GOOS=linux GOARCH=arm64 GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/vmagent-arm64 ./app/vmagent
vmagent-ppc64le:
CGO_ENABLED=0 GOOS=linux GOARCH=ppc64le GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/vmagent-ppc64le ./app/vmagent
vmagent-386:
CGO_ENABLED=0 GOOS=linux GOARCH=386 GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/vmagent-386 ./app/vmagent
vmagent-pure:
APP_NAME=vmagent $(MAKE) app-local-pure

app/vmagent/README.md

@@ -0,0 +1,170 @@
## vmagent
`vmagent` is a tiny but brave agent, which helps you collect metrics from various sources
and store them in [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics).
<img alt="vmagent" src="vmagent.png">
### Motivation
While VictoriaMetrics provides an efficient solution to store and observe metrics, our users needed something fast
and RAM friendly to scrape metrics from Prometheus-compatible exporters to VictoriaMetrics.
Also, we found that users' infrastructure is like snowflakes - never alike, and we decided to add more flexibility
to `vmagent` (like the ability to push metrics instead of pulling them). We did our best and plan to do even more.
### Features
* Can be used as drop-in replacement for Prometheus for scraping targets such as [node_exporter](https://github.com/prometheus/node_exporter).
See [Quick Start](#quick-start) for details.
* Can add, remove and modify labels via Prometheus relabeling. See [these docs](#relabeling) for details.
* Accepts data via all the ingestion protocols supported by VictoriaMetrics:
* Influx line protocol via `http://<vmagent>:8429/write`. See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#how-to-send-data-from-influxdb-compatible-agents-such-as-telegraf).
* JSON lines import protocol via `http://<vmagent>:8429/api/v1/import`. See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#how-to-import-time-series-data).
* Graphite plaintext protocol if `-graphiteListenAddr` command-line flag is set. See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#how-to-send-data-from-graphite-compatible-agents-such-as-statsd).
* OpenTSDB telnet and http protocols if `-opentsdbListenAddr` command-line flag is set. See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#how-to-send-data-from-opentsdb-compatible-agents).
* Prometheus remote write protocol via `http://<vmagent>:8429/api/v1/write`.
* Can replicate collected metrics simultaneously to multiple remote storage systems.
* Works in environments with unstable connections to remote storage. If the remote storage is unavailable, the collected metrics
are buffered at `-remoteWrite.tmpDataPath`. The buffered metrics are sent to remote storage as soon as connection
to remote storage is recovered.
* Uses lower amounts of RAM, CPU, disk IO and network bandwidth compared to Prometheus.
### Quick Start
Just download `vmutils-*` archive from [releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases), unpack it
and pass the following flags to `vmagent` binary in order to start scraping Prometheus targets:
* `-promscrape.config` with the path to Prometheus config file (it is usually located at `/etc/prometheus/prometheus.yml`)
* `-remoteWrite.url` with the remote storage endpoint such as VictoriaMetrics. Multiple `-remoteWrite.url` args can be set in parallel
in order to replicate data concurrently to multiple remote storage systems.
Example command line:
```
/path/to/vmagent -promscrape.config=/path/to/prometheus.yml -remoteWrite.url=https://victoria-metrics-host:8428/api/v1/write
```
If you only need to collect Influx data, then the following command line would be enough:
```
/path/to/vmagent -remoteWrite.url=https://victoria-metrics-host:8428/api/v1/write
```
Then send Influx data to `http://vmagent-host:8429/write`. See [these docs](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#how-to-send-data-from-influxdb-compatible-agents-such-as-telegraf) for more details.
`vmagent` is also available in [docker images](https://hub.docker.com/r/victoriametrics/vmagent/).
Pass `-help` to `vmagent` in order to see the full list of supported command-line flags with their descriptions.
### How to collect metrics in Prometheus format?
Pass the path to `prometheus.yml` to `-promscrape.config` command-line flag. `vmagent` takes into account the following
sections from [Prometheus config file](https://prometheus.io/docs/prometheus/latest/configuration/configuration/):
* `global`
* `scrape_configs`
All the other sections are ignored, including [remote_write](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write) section.
Use `-remoteWrite.*` command-line flags instead for configuring remote write settings:
* `-remoteWrite.url` for pointing to remote storage. Data to remote storage can be sent either via HTTP or HTTPS. See `-remoteWrite.tls*` flags for details.
* `-remoteWrite.label` for adding labels to metrics before sending them to remote storage.
* `-remoteWrite.relabelConfig` for applying relabeling to metrics before sending them to remote storage.
The following scrape types in [scrape_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config) section are supported:
* `static_configs` - for scraping statically defined targets. See [these docs](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#static_config) for details.
* `file_sd_configs` - for scraping targets defined in external files aka file-based service discovery.
See [these docs](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#file_sd_config) for details.
The following service discovery mechanisms will be added to `vmagent` soon:
* [kubernetes_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#kubernetes_sd_config)
* [ec2_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#ec2_sd_config)
* [gce_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#gce_sd_config)
* [consul_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#consul_sd_config)
* [dns_sd_config](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#dns_sd_config)
File feature requests at [our issue tracker](https://github.com/VictoriaMetrics/VictoriaMetrics/issues) if you need other service discovery mechanisms to be supported by `vmagent`.
### Adding labels to metrics
Labels can be added to metrics via the following mechanisms:
* Via `global -> external_labels` section in `-promscrape.config` file. These labels are added only to metrics scraped from targets configured in `-promscrape.config` file.
* Via `-remoteWrite.label` command-line flag. These labels are added to all the collected metrics before sending them to `-remoteWrite.url`.
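For example, assuming the flag takes `name=value` pairs, passing something like `-remoteWrite.label=datacenter=dc-123` would add the `datacenter="dc-123"` label to all the metrics pushed to `-remoteWrite.url`.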
### Relabeling
`vmagent` supports [Prometheus relabeling](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config).
Additionally it provides the following extra actions:
* `replace_all`: replaces all the occurrences of `regex` in the values of `source_labels` with the `replacement` and stores the result in the `target_label`.
* `labelmap_all`: replaces all the occurrences of `regex` in all the label names with the `replacement`.
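As a hypothetical illustration, a `replace_all` rule with `source_labels: ["__name__"]`, `regex: "\\."`, `replacement: "_"` and `target_label: "__name__"` would replace all the dots in metric names with underscores.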
The relabeling can be defined in the following places:
* At `scrape_config -> relabel_configs` section in `-promscrape.config` file. This relabeling is applied to targets when parsing the file during `vmagent` startup
or during config reload after sending `SIGHUP` signal to `vmagent` via `kill -HUP`.
* At `scrape_config -> metric_relabel_configs` section in `-promscrape.config` file. This relabeling is applied to metrics after each scrape for the configured targets.
* At `-remoteWrite.relabelConfig` file. This relabeling is applied to all the collected metrics before sending them to `-remoteWrite.url`.
Read more about relabeling in the following articles:
* [Life of a label](https://www.robustperception.io/life-of-a-label)
* [Discarding targets and timeseries with relabeling](https://www.robustperception.io/relabelling-can-discard-targets-timeseries-and-alerts)
* [Dropping labels at scrape time](https://www.robustperception.io/dropping-metrics-at-scrape-time-with-prometheus)
* [Extracting labels from legacy metric names](https://www.robustperception.io/extracting-labels-from-legacy-metric-names)
* [relabel_configs vs metric_relabel_configs](https://www.robustperception.io/relabel_configs-vs-metric_relabel_configs)
### Monitoring
`vmagent` exports various metrics in Prometheus exposition format at `http://vmagent-host:8429/metrics` page. It is recommended setting up regular scraping of this page
either via `vmagent` itself or via Prometheus, so the exported metrics could be analyzed later.
`vmagent` also exports target statuses at `http://vmagent-host:8429/targets` page in plaintext format.
### Troubleshooting
* It is recommended increasing the maximum number of open files in the system (`ulimit -n`) when scraping a big number of targets,
since `vmagent` establishes at least a single TCP connection per each target.
* It is recommended increasing `-remoteWrite.queues` if `vmagent` collects more than 100K samples per second
and `vmagent_remotewrite_pending_data_bytes` metric exported by `vmagent` at `/metrics` page constantly grows.
* `vmagent` buffers scraped data at `-remoteWrite.tmpDataPath` directory until it is sent to `-remoteWrite.url`.
The directory can grow big when remote storage is unavailable during extended periods of time. If you don't want
sending all the data from the directory to remote storage, just stop `vmagent` and delete the directory.
### How to build from sources
It is recommended using [binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) - `vmagent` is located in `vmutils-*` archives there.
#### Development build
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.12.
2. Run `make vmagent` from the root folder of the repository.
It builds `vmagent` binary and puts it into the `bin` folder.
#### Production build
1. [Install docker](https://docs.docker.com/install/).
2. Run `make vmagent-prod` from the root folder of the repository.
It builds `vmagent-prod` binary and puts it into the `bin` folder.
#### Building docker images
Run `make package-vmagent`. It builds `victoriametrics/vmagent:<PKG_TAG>` docker image locally.
`<PKG_TAG>` is auto-generated image tag, which depends on source code in the repository.
The `<PKG_TAG>` may be manually set via `PKG_TAG=foobar make package-vmagent`.


@@ -0,0 +1,70 @@
package common
import (
"runtime"
"sync"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
)
// PushCtx is a context used for populating WriteRequest.
type PushCtx struct {
WriteRequest prompbmarshal.WriteRequest
// Labels contains flat list of all the labels used in WriteRequest.
Labels []prompbmarshal.Label
// Samples contains flat list of all the samples used in WriteRequest.
Samples []prompbmarshal.Sample
}
// Reset resets ctx.
func (ctx *PushCtx) Reset() {
tss := ctx.WriteRequest.Timeseries
for i := range tss {
ts := &tss[i]
ts.Labels = nil
ts.Samples = nil
}
ctx.WriteRequest.Timeseries = ctx.WriteRequest.Timeseries[:0]
labels := ctx.Labels
for i := range labels {
label := &labels[i]
label.Name = ""
label.Value = ""
}
ctx.Labels = ctx.Labels[:0]
ctx.Samples = ctx.Samples[:0]
}
// GetPushCtx returns PushCtx from pool.
//
// Call PutPushCtx when the ctx is no longer needed.
func GetPushCtx() *PushCtx {
select {
case ctx := <-pushCtxPoolCh:
return ctx
default:
if v := pushCtxPool.Get(); v != nil {
return v.(*PushCtx)
}
return &PushCtx{}
}
}
// PutPushCtx returns ctx to the pool.
//
// ctx mustn't be used after returning to the pool.
func PutPushCtx(ctx *PushCtx) {
ctx.Reset()
select {
case pushCtxPoolCh <- ctx:
default:
pushCtxPool.Put(ctx)
}
}
var pushCtxPool sync.Pool
var pushCtxPoolCh = make(chan *PushCtx, runtime.GOMAXPROCS(-1))


@@ -0,0 +1,8 @@
ARG certs_image
FROM $certs_image AS certs
FROM scratch
COPY --from=certs /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-certificates.crt
ARG src_binary
COPY $src_binary ./vmagent-prod
EXPOSE 8429
ENTRYPOINT ["/vmagent-prod"]


@@ -0,0 +1,65 @@
package graphite
import (
"io"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/common"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
parser "github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/graphite"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/writeconcurrencylimiter"
"github.com/VictoriaMetrics/metrics"
)
var (
rowsInserted = metrics.NewCounter(`vmagent_rows_inserted_total{type="graphite"}`)
rowsPerInsert = metrics.NewHistogram(`vmagent_rows_per_insert{type="graphite"}`)
)
// InsertHandler processes remote write for graphite plaintext protocol.
//
// See https://graphite.readthedocs.io/en/latest/feeding-carbon.html#the-plaintext-protocol
func InsertHandler(r io.Reader) error {
return writeconcurrencylimiter.Do(func() error {
return parser.ParseStream(r, insertRows)
})
}
func insertRows(rows []parser.Row) error {
ctx := common.GetPushCtx()
defer common.PutPushCtx(ctx)
tssDst := ctx.WriteRequest.Timeseries[:0]
labels := ctx.Labels[:0]
samples := ctx.Samples[:0]
for i := range rows {
r := &rows[i]
labelsLen := len(labels)
labels = append(labels, prompbmarshal.Label{
Name: "__name__",
Value: r.Metric,
})
for j := range r.Tags {
tag := &r.Tags[j]
labels = append(labels, prompbmarshal.Label{
Name: tag.Key,
Value: tag.Value,
})
}
samples = append(samples, prompbmarshal.Sample{
Value: r.Value,
Timestamp: r.Timestamp,
})
tssDst = append(tssDst, prompbmarshal.TimeSeries{
Labels: labels[labelsLen:],
Samples: samples[len(samples)-1:],
})
}
ctx.WriteRequest.Timeseries = tssDst
ctx.Labels = labels
ctx.Samples = samples
remotewrite.Push(&ctx.WriteRequest)
rowsInserted.Add(len(rows))
rowsPerInsert.Update(float64(len(rows)))
return nil
}


@@ -0,0 +1,167 @@
package influx
import (
"flag"
"io"
"net/http"
"runtime"
"sync"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/common"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
parser "github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/influx"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/writeconcurrencylimiter"
"github.com/VictoriaMetrics/metrics"
)
var (
measurementFieldSeparator = flag.String("influxMeasurementFieldSeparator", "_", "Separator for '{measurement}{separator}{field_name}' metric name when inserted via Influx line protocol")
skipSingleField = flag.Bool("influxSkipSingleField", false, "Uses '{measurement}' instead of '{measurement}{separator}{field_name}' for metric name if Influx line contains only a single field")
)
var (
rowsInserted = metrics.NewCounter(`vmagent_rows_inserted_total{type="influx"}`)
rowsPerInsert = metrics.NewHistogram(`vmagent_rows_per_insert{type="influx"}`)
)
// InsertHandlerForReader processes remote write for influx line protocol.
//
// See https://github.com/influxdata/telegraf/tree/master/plugins/inputs/socket_listener/
func InsertHandlerForReader(r io.Reader) error {
return writeconcurrencylimiter.Do(func() error {
return parser.ParseStream(r, false, "", "", insertRows)
})
}
// InsertHandlerForHTTP processes remote write for influx line protocol.
//
// See https://github.com/influxdata/influxdb/blob/4cbdc197b8117fee648d62e2e5be75c6575352f0/tsdb/README.md
func InsertHandlerForHTTP(req *http.Request) error {
return writeconcurrencylimiter.Do(func() error {
isGzipped := req.Header.Get("Content-Encoding") == "gzip"
q := req.URL.Query()
precision := q.Get("precision")
// Read db tag from https://docs.influxdata.com/influxdb/v1.7/tools/api/#write-http-endpoint
db := q.Get("db")
return parser.ParseStream(req.Body, isGzipped, precision, db, insertRows)
})
}
func insertRows(db string, rows []parser.Row) error {
ctx := getPushCtx()
defer putPushCtx(ctx)
rowsTotal := 0
tssDst := ctx.ctx.WriteRequest.Timeseries[:0]
labels := ctx.ctx.Labels[:0]
samples := ctx.ctx.Samples[:0]
commonLabels := ctx.commonLabels[:0]
buf := ctx.buf[:0]
for i := range rows {
r := &rows[i]
commonLabels = commonLabels[:0]
hasDBLabel := false
for j := range r.Tags {
tag := &r.Tags[j]
if tag.Key == "db" {
hasDBLabel = true
}
commonLabels = append(commonLabels, prompbmarshal.Label{
Name: tag.Key,
Value: tag.Value,
})
}
if len(db) > 0 && !hasDBLabel {
commonLabels = append(commonLabels, prompbmarshal.Label{
Name: "db",
Value: db,
})
}
ctx.metricGroupBuf = append(ctx.metricGroupBuf[:0], r.Measurement...)
skipFieldKey := len(r.Fields) == 1 && *skipSingleField
if len(ctx.metricGroupBuf) > 0 && !skipFieldKey {
ctx.metricGroupBuf = append(ctx.metricGroupBuf, *measurementFieldSeparator...)
}
for j := range r.Fields {
f := &r.Fields[j]
bufLen := len(buf)
buf = append(buf, ctx.metricGroupBuf...)
if !skipFieldKey {
buf = append(buf, f.Key...)
}
metricGroup := bytesutil.ToUnsafeString(buf[bufLen:])
labelsLen := len(labels)
labels = append(labels, prompbmarshal.Label{
Name: "__name__",
Value: metricGroup,
})
labels = append(labels, commonLabels...)
samples = append(samples, prompbmarshal.Sample{
Timestamp: r.Timestamp,
Value: f.Value,
})
tssDst = append(tssDst, prompbmarshal.TimeSeries{
Labels: labels[labelsLen:],
Samples: samples[len(samples)-1:],
})
}
rowsTotal += len(r.Fields)
}
ctx.buf = buf
ctx.ctx.WriteRequest.Timeseries = tssDst
ctx.ctx.Labels = labels
ctx.ctx.Samples = samples
ctx.commonLabels = commonLabels
remotewrite.Push(&ctx.ctx.WriteRequest)
rowsInserted.Add(rowsTotal)
rowsPerInsert.Update(float64(rowsTotal))
return nil
}
type pushCtx struct {
ctx common.PushCtx
commonLabels []prompbmarshal.Label
metricGroupBuf []byte
buf []byte
}
func (ctx *pushCtx) reset() {
ctx.ctx.Reset()
commonLabels := ctx.commonLabels
for i := range commonLabels {
label := &commonLabels[i]
label.Name = ""
label.Value = ""
}
ctx.metricGroupBuf = ctx.metricGroupBuf[:0]
ctx.buf = ctx.buf[:0]
}
func getPushCtx() *pushCtx {
select {
case ctx := <-pushCtxPoolCh:
return ctx
default:
if v := pushCtxPool.Get(); v != nil {
return v.(*pushCtx)
}
return &pushCtx{}
}
}
func putPushCtx(ctx *pushCtx) {
ctx.reset()
select {
case pushCtxPoolCh <- ctx:
default:
pushCtxPool.Put(ctx)
}
}
var pushCtxPool sync.Pool
var pushCtxPoolCh = make(chan *pushCtx, runtime.GOMAXPROCS(-1))

app/vmagent/main.go

@@ -0,0 +1,167 @@
package main
import (
"flag"
"fmt"
"net/http"
"strings"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/graphite"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/influx"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/opentsdb"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/opentsdbhttp"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/promremotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/vmimport"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/envflag"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
graphiteserver "github.com/VictoriaMetrics/VictoriaMetrics/lib/ingestserver/graphite"
influxserver "github.com/VictoriaMetrics/VictoriaMetrics/lib/ingestserver/influx"
opentsdbserver "github.com/VictoriaMetrics/VictoriaMetrics/lib/ingestserver/opentsdb"
opentsdbhttpserver "github.com/VictoriaMetrics/VictoriaMetrics/lib/ingestserver/opentsdbhttp"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promscrape"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/writeconcurrencylimiter"
"github.com/VictoriaMetrics/metrics"
)
var (
httpListenAddr = flag.String("httpListenAddr", ":8429", "TCP address to listen for http connections. "+
"Set this flag to empty value in order to disable listening on any port. This mode may be useful for running multiple vmagent instances on the same server. "+
"Note that /targets and /metrics pages aren't available if -httpListenAddr=''")
influxListenAddr = flag.String("influxListenAddr", "", "TCP and UDP address to listen for Influx line protocol data. Usually :8189 must be set. Doesn't work if empty")
graphiteListenAddr = flag.String("graphiteListenAddr", "", "TCP and UDP address to listen for Graphite plaintext data. Usually :2003 must be set. Doesn't work if empty")
opentsdbListenAddr = flag.String("opentsdbListenAddr", "", "TCP and UDP address to listen for OpenTSDB metrics. "+
"Telnet put messages and HTTP /api/put messages are simultaneously served on TCP port. "+
"Usually :4242 must be set. Doesn't work if empty")
opentsdbHTTPListenAddr = flag.String("opentsdbHTTPListenAddr", "", "TCP address to listen for OpenTSDB HTTP put requests. Usually :4242 must be set. Doesn't work if empty")
)
var (
influxServer *influxserver.Server
graphiteServer *graphiteserver.Server
opentsdbServer *opentsdbserver.Server
opentsdbhttpServer *opentsdbhttpserver.Server
)
func main() {
envflag.Parse()
buildinfo.Init()
logger.Init()
logger.Infof("starting vmagent at %q...", *httpListenAddr)
startTime := time.Now()
remotewrite.Init()
writeconcurrencylimiter.Init()
if len(*influxListenAddr) > 0 {
influxServer = influxserver.MustStart(*influxListenAddr, influx.InsertHandlerForReader)
}
if len(*graphiteListenAddr) > 0 {
graphiteServer = graphiteserver.MustStart(*graphiteListenAddr, graphite.InsertHandler)
}
if len(*opentsdbListenAddr) > 0 {
opentsdbServer = opentsdbserver.MustStart(*opentsdbListenAddr, opentsdb.InsertHandler, opentsdbhttp.InsertHandler)
}
if len(*opentsdbHTTPListenAddr) > 0 {
opentsdbhttpServer = opentsdbhttpserver.MustStart(*opentsdbHTTPListenAddr, opentsdbhttp.InsertHandler)
}
promscrape.Init(remotewrite.Push)
if len(*httpListenAddr) > 0 {
go httpserver.Serve(*httpListenAddr, requestHandler)
}
logger.Infof("started vmagent in %.3f seconds", time.Since(startTime).Seconds())
sig := procutil.WaitForSigterm()
logger.Infof("received signal %s", sig)
startTime = time.Now()
if len(*httpListenAddr) > 0 {
logger.Infof("gracefully shutting down webservice at %q", *httpListenAddr)
if err := httpserver.Stop(*httpListenAddr); err != nil {
logger.Fatalf("cannot stop the webservice: %s", err)
}
logger.Infof("successfully shut down the webservice in %.3f seconds", time.Since(startTime).Seconds())
}
promscrape.Stop()
if len(*influxListenAddr) > 0 {
influxServer.MustStop()
}
if len(*graphiteListenAddr) > 0 {
graphiteServer.MustStop()
}
if len(*opentsdbListenAddr) > 0 {
opentsdbServer.MustStop()
}
if len(*opentsdbHTTPListenAddr) > 0 {
opentsdbhttpServer.MustStop()
}
remotewrite.Stop()
logger.Infof("successfully stopped vmagent in %.3f seconds", time.Since(startTime).Seconds())
}
func requestHandler(w http.ResponseWriter, r *http.Request) bool {
path := strings.Replace(r.URL.Path, "//", "/", -1)
switch path {
case "/api/v1/write":
prometheusWriteRequests.Inc()
if err := promremotewrite.InsertHandler(r); err != nil {
prometheusWriteErrors.Inc()
httpserver.Errorf(w, "error in %q: %s", r.URL.Path, err)
return true
}
w.WriteHeader(http.StatusNoContent)
return true
case "/api/v1/import":
vmimportRequests.Inc()
if err := vmimport.InsertHandler(r); err != nil {
vmimportErrors.Inc()
httpserver.Errorf(w, "error in %q: %s", r.URL.Path, err)
return true
}
w.WriteHeader(http.StatusNoContent)
return true
case "/write", "/api/v2/write":
influxWriteRequests.Inc()
if err := influx.InsertHandlerForHTTP(r); err != nil {
influxWriteErrors.Inc()
httpserver.Errorf(w, "error in %q: %s", r.URL.Path, err)
return true
}
w.WriteHeader(http.StatusNoContent)
return true
case "/query":
// Emulate fake response for influx query.
// This is required for TSBS benchmark.
influxQueryRequests.Inc()
fmt.Fprintf(w, `{"results":[{"series":[{"values":[]}]}]}`)
return true
case "/targets":
promscrapeTargetsRequests.Inc()
w.Header().Set("Content-Type", "text/plain")
promscrape.WriteHumanReadableTargetsStatus(w)
return true
}
return false
}
var (
prometheusWriteRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/api/v1/write", protocol="prometheus"}`)
prometheusWriteErrors = metrics.NewCounter(`vmagent_http_request_errors_total{path="/api/v1/write", protocol="prometheus"}`)
vmimportRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/api/v1/import", protocol="vm"}`)
vmimportErrors = metrics.NewCounter(`vmagent_http_request_errors_total{path="/api/v1/import", protocol="vm"}`)
influxWriteRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/write", protocol="influx"}`)
influxWriteErrors = metrics.NewCounter(`vmagent_http_request_errors_total{path="/write", protocol="influx"}`)
influxQueryRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/query", protocol="influx"}`)
promscrapeTargetsRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/targets"}`)
)


@@ -0,0 +1,65 @@
package opentsdb
import (
"io"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/common"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
parser "github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/opentsdb"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/writeconcurrencylimiter"
"github.com/VictoriaMetrics/metrics"
)
var (
rowsInserted = metrics.NewCounter(`vmagent_rows_inserted_total{type="opentsdb"}`)
rowsPerInsert = metrics.NewHistogram(`vmagent_rows_per_insert{type="opentsdb"}`)
)
// InsertHandler processes remote write for OpenTSDB put protocol.
//
// See http://opentsdb.net/docs/build/html/api_telnet/put.html
func InsertHandler(r io.Reader) error {
return writeconcurrencylimiter.Do(func() error {
return parser.ParseStream(r, insertRows)
})
}
func insertRows(rows []parser.Row) error {
ctx := common.GetPushCtx()
defer common.PutPushCtx(ctx)
tssDst := ctx.WriteRequest.Timeseries[:0]
labels := ctx.Labels[:0]
samples := ctx.Samples[:0]
for i := range rows {
r := &rows[i]
labelsLen := len(labels)
labels = append(labels, prompbmarshal.Label{
Name: "__name__",
Value: r.Metric,
})
for j := range r.Tags {
tag := &r.Tags[j]
labels = append(labels, prompbmarshal.Label{
Name: tag.Key,
Value: tag.Value,
})
}
samples = append(samples, prompbmarshal.Sample{
Value: r.Value,
Timestamp: r.Timestamp,
})
tssDst = append(tssDst, prompbmarshal.TimeSeries{
Labels: labels[labelsLen:],
Samples: samples[len(samples)-1:],
})
}
ctx.WriteRequest.Timeseries = tssDst
ctx.Labels = labels
ctx.Samples = samples
remotewrite.Push(&ctx.WriteRequest)
rowsInserted.Add(len(rows))
rowsPerInsert.Update(float64(len(rows)))
return nil
}


@@ -0,0 +1,64 @@
package opentsdbhttp
import (
"net/http"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/common"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
parser "github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/opentsdbhttp"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/writeconcurrencylimiter"
"github.com/VictoriaMetrics/metrics"
)
var (
rowsInserted = metrics.NewCounter(`vmagent_rows_inserted_total{type="opentsdbhttp"}`)
rowsPerInsert = metrics.NewHistogram(`vmagent_rows_per_insert{type="opentsdbhttp"}`)
)
// InsertHandler processes HTTP OpenTSDB put requests.
// See http://opentsdb.net/docs/build/html/api_http/put.html
func InsertHandler(req *http.Request) error {
return writeconcurrencylimiter.Do(func() error {
return parser.ParseStream(req, insertRows)
})
}
func insertRows(rows []parser.Row) error {
ctx := common.GetPushCtx()
defer common.PutPushCtx(ctx)
tssDst := ctx.WriteRequest.Timeseries[:0]
labels := ctx.Labels[:0]
samples := ctx.Samples[:0]
for i := range rows {
r := &rows[i]
labelsLen := len(labels)
labels = append(labels, prompbmarshal.Label{
Name: "__name__",
Value: r.Metric,
})
for j := range r.Tags {
tag := &r.Tags[j]
labels = append(labels, prompbmarshal.Label{
Name: tag.Key,
Value: tag.Value,
})
}
samples = append(samples, prompbmarshal.Sample{
Value: r.Value,
Timestamp: r.Timestamp,
})
tssDst = append(tssDst, prompbmarshal.TimeSeries{
Labels: labels[labelsLen:],
Samples: samples[len(samples)-1:],
})
}
ctx.WriteRequest.Timeseries = tssDst
ctx.Labels = labels
ctx.Samples = samples
remotewrite.Push(&ctx.WriteRequest)
rowsInserted.Add(len(rows))
rowsPerInsert.Update(float64(len(rows)))
return nil
}


@@ -0,0 +1,67 @@
package promremotewrite
import (
"net/http"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/common"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompb"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
parser "github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/promremotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/writeconcurrencylimiter"
"github.com/VictoriaMetrics/metrics"
)
var (
rowsInserted = metrics.NewCounter(`vmagent_rows_inserted_total{type="promremotewrite"}`)
rowsPerInsert = metrics.NewHistogram(`vmagent_rows_per_insert{type="promremotewrite"}`)
)
// InsertHandler processes remote write for prometheus.
func InsertHandler(req *http.Request) error {
return writeconcurrencylimiter.Do(func() error {
return parser.ParseStream(req, insertRows)
})
}
func insertRows(timeseries []prompb.TimeSeries) error {
ctx := common.GetPushCtx()
defer common.PutPushCtx(ctx)
rowsTotal := 0
tssDst := ctx.WriteRequest.Timeseries[:0]
labels := ctx.Labels[:0]
samples := ctx.Samples[:0]
for i := range timeseries {
ts := &timeseries[i]
labelsLen := len(labels)
for i := range ts.Labels {
label := &ts.Labels[i]
labels = append(labels, prompbmarshal.Label{
Name: bytesutil.ToUnsafeString(label.Name),
Value: bytesutil.ToUnsafeString(label.Value),
})
}
samplesLen := len(samples)
for i := range ts.Samples {
sample := &ts.Samples[i]
samples = append(samples, prompbmarshal.Sample{
Value: sample.Value,
Timestamp: sample.Timestamp,
})
}
tssDst = append(tssDst, prompbmarshal.TimeSeries{
Labels: labels[labelsLen:],
Samples: samples[samplesLen:],
})
rowsTotal += len(ts.Samples)
}
ctx.WriteRequest.Timeseries = tssDst
ctx.Labels = labels
ctx.Samples = samples
remotewrite.Push(&ctx.WriteRequest)
rowsInserted.Add(rowsTotal)
rowsPerInsert.Update(float64(rowsTotal))
return nil
}


@@ -0,0 +1,267 @@
package remotewrite
import (
"crypto/tls"
"crypto/x509"
"encoding/base64"
"flag"
"fmt"
"io/ioutil"
"strings"
"sync"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/netutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/persistentqueue"
"github.com/VictoriaMetrics/metrics"
"github.com/valyala/fasthttp"
)
var (
sendTimeout = flag.Duration("remoteWrite.sendTimeout", time.Minute, "Timeout for sending a single block of data to -remoteWrite.url")
tlsInsecureSkipVerify = flag.Bool("remoteWrite.tlsInsecureSkipVerify", false, "Whether to skip tls verification when connecting to -remoteWrite.url")
tlsCertFile = flag.String("remoteWrite.tlsCertFile", "", "Optional path to client-side TLS certificate file to use when connecting to -remoteWrite.url")
tlsKeyFile = flag.String("remoteWrite.tlsKeyFile", "", "Optional path to client-side TLS certificate key to use when connecting to -remoteWrite.url")
tlsCAFile = flag.String("remoteWrite.tlsCAFile", "", "Optional path to TLS CA file to use for verifying connections to -remoteWrite.url. "+
"By default system CA is used")
basicAuthUsername = flag.String("remoteWrite.basicAuth.username", "", "Optional basic auth username to use for -remoteWrite.url")
basicAuthPassword = flag.String("remoteWrite.basicAuth.password", "", "Optional basic auth password to use for -remoteWrite.url")
bearerToken = flag.String("remoteWrite.bearerToken", "", "Optional bearer auth token to use for -remoteWrite.url")
)
type client struct {
urlLabelValue string
remoteWriteURL string
host string
requestURI string
authHeader string
fq *persistentqueue.FastQueue
hc *fasthttp.HostClient
requestDuration *metrics.Histogram
requestsOKCount *metrics.Counter
errorsCount *metrics.Counter
retriesCount *metrics.Counter
wg sync.WaitGroup
stopCh chan struct{}
}
func newClient(remoteWriteURL, urlLabelValue string, fq *persistentqueue.FastQueue) *client {
authHeader := ""
if len(*basicAuthUsername) > 0 || len(*basicAuthPassword) > 0 {
// See https://en.wikipedia.org/wiki/Basic_access_authentication
token := *basicAuthUsername + ":" + *basicAuthPassword
token64 := base64.StdEncoding.EncodeToString([]byte(token))
authHeader = "Basic " + token64
}
if len(*bearerToken) > 0 {
if authHeader != "" {
logger.Panicf("FATAL: `-remoteWrite.bearerToken`=%q cannot be set when `-remoteWrite.basicAuth.*` flags are set", *bearerToken)
}
authHeader = "Bearer " + *bearerToken
}
readTimeout := *sendTimeout
if readTimeout <= 0 {
readTimeout = time.Minute
}
var u fasthttp.URI
u.Update(remoteWriteURL)
scheme := string(u.Scheme())
switch scheme {
case "http", "https":
default:
logger.Panicf("FATAL: unsupported scheme in -remoteWrite.url=%q: %q. It must be http or https", remoteWriteURL, scheme)
}
host := string(u.Host())
if len(host) == 0 {
logger.Panicf("FATAL: invalid -remoteWrite.url=%q: host cannot be empty. Make sure the url looks like `http://host:port/path`", remoteWriteURL)
}
requestURI := string(u.RequestURI())
isTLS := scheme == "https"
var tlsCfg *tls.Config
if isTLS {
var err error
tlsCfg, err = getTLSConfig()
if err != nil {
logger.Panicf("FATAL: cannot initialize TLS config: %s", err)
}
}
if !strings.Contains(host, ":") {
if isTLS {
host += ":443"
} else {
host += ":80"
}
}
maxConns := 2 * *queues
hc := &fasthttp.HostClient{
Addr: host,
Name: "vmagent",
Dial: statDial,
DialDualStack: netutil.TCP6Enabled(),
IsTLS: isTLS,
TLSConfig: tlsCfg,
MaxConns: maxConns,
MaxIdleConnDuration: 10 * readTimeout,
ReadTimeout: readTimeout,
WriteTimeout: 10 * time.Second,
MaxResponseBodySize: 1024 * 1024,
}
c := &client{
urlLabelValue: urlLabelValue,
remoteWriteURL: remoteWriteURL,
host: host,
requestURI: requestURI,
authHeader: authHeader,
fq: fq,
hc: hc,
stopCh: make(chan struct{}),
}
c.requestDuration = metrics.NewHistogram(fmt.Sprintf(`vmagent_remotewrite_duration_seconds{url=%q}`, c.urlLabelValue))
c.requestsOKCount = metrics.NewCounter(fmt.Sprintf(`vmagent_remotewrite_requests_total{url=%q, status_code="2XX"}`, c.urlLabelValue))
c.errorsCount = metrics.NewCounter(fmt.Sprintf(`vmagent_remotewrite_errors_total{url=%q}`, c.urlLabelValue))
c.retriesCount = metrics.NewCounter(fmt.Sprintf(`vmagent_remotewrite_retries_count_total{url=%q}`, c.urlLabelValue))
for i := 0; i < *queues; i++ {
c.wg.Add(1)
go func() {
defer c.wg.Done()
c.runWorker()
}()
}
logger.Infof("initialized client for -remoteWrite.url=%q", c.remoteWriteURL)
return c
}
func (c *client) MustStop() {
close(c.stopCh)
c.wg.Wait()
logger.Infof("stopped client for -remoteWrite.url=%q", c.remoteWriteURL)
}
func getTLSConfig() (*tls.Config, error) {
var tlsRootCA *x509.CertPool
var tlsCertificate *tls.Certificate
if *tlsCertFile != "" || *tlsKeyFile != "" {
cert, err := tls.LoadX509KeyPair(*tlsCertFile, *tlsKeyFile)
if err != nil {
return nil, fmt.Errorf("cannot load TLS certificate for -remoteWrite.tlsCertFile=%q and -remoteWrite.tlsKeyFile=%q: %s", *tlsCertFile, *tlsKeyFile, err)
}
tlsCertificate = &cert
}
if *tlsCAFile != "" {
data, err := ioutil.ReadFile(*tlsCAFile)
if err != nil {
return nil, fmt.Errorf("cannot read -remoteWrite.tlsCAFile=%q: %s", *tlsCAFile, err)
}
tlsRootCA = x509.NewCertPool()
if !tlsRootCA.AppendCertsFromPEM(data) {
return nil, fmt.Errorf("cannot parse data -remoteWrite.tlsCAFile=%q", *tlsCAFile)
}
}
tlsCfg := &tls.Config{
RootCAs: tlsRootCA,
ClientSessionCache: tls.NewLRUClientSessionCache(0),
}
if tlsCertificate != nil {
tlsCfg.Certificates = []tls.Certificate{*tlsCertificate}
}
tlsCfg.InsecureSkipVerify = *tlsInsecureSkipVerify
return tlsCfg, nil
}
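// runWorker sends blocks from the persistent queue to the remote storage until the client is stopped.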
func (c *client) runWorker() {
var ok bool
var block []byte
ch := make(chan struct{})
for {
block, ok = c.fq.MustReadBlock(block[:0])
if !ok {
return
}
go func() {
c.sendBlock(block)
ch <- struct{}{}
}()
select {
case <-ch:
// The block has been sent successfully
continue
case <-c.stopCh:
// c must be stopped. Wait for a while in the hope the block will be sent.
graceDuration := 5 * time.Second
select {
case <-ch:
// The block has been sent successfully.
case <-time.After(graceDuration):
logger.Errorf("couldn't sent block with size %d bytes to %q in %.3f seconds during shutdown; dropping it",
len(block), c.remoteWriteURL, graceDuration.Seconds())
}
return
}
}
}
func (c *client) sendBlock(block []byte) {
req := fasthttp.AcquireRequest()
req.SetRequestURI(c.requestURI)
req.SetHost(c.host)
req.Header.SetMethod("POST")
req.Header.Add("Content-Type", "application/x-protobuf")
req.Header.Add("Content-Encoding", "snappy")
if c.authHeader != "" {
req.Header.Set("Authorization", c.authHeader)
}
req.SetBody(block)
retryDuration := time.Second
resp := fasthttp.AcquireResponse()
again:
select {
case <-c.stopCh:
fasthttp.ReleaseRequest(req)
fasthttp.ReleaseResponse(resp)
return
default:
}
startTime := time.Now()
// There is no need to call DoTimeout, since the timeout is set in c.hc.ReadTimeout.
err := c.hc.Do(req, resp)
c.requestDuration.UpdateDuration(startTime)
if err != nil {
c.errorsCount.Inc()
retryDuration *= 2
if retryDuration > time.Minute {
retryDuration = time.Minute
}
logger.Errorf("couldn't send a block with size %d bytes to %q: %s; re-sending the block in %.3f seconds",
len(block), c.remoteWriteURL, err, retryDuration.Seconds())
time.Sleep(retryDuration)
c.retriesCount.Inc()
goto again
}
statusCode := resp.StatusCode()
if statusCode/100 != 2 {
metrics.GetOrCreateCounter(fmt.Sprintf(`vmagent_remotewrite_requests_total{url=%q, status_code="%d"}`, c.urlLabelValue, statusCode)).Inc()
retryDuration *= 2
if retryDuration > time.Minute {
retryDuration = time.Minute
}
logger.Errorf("unexpected status code received after sending a block with size %d bytes to %q: %d; response body=%q; re-sending the block in %.3f seconds",
len(block), c.remoteWriteURL, statusCode, resp.Body(), retryDuration.Seconds())
time.Sleep(retryDuration)
c.retriesCount.Inc()
goto again
}
c.requestsOKCount.Inc()
// The block has been successfully sent to the remote storage.
fasthttp.ReleaseResponse(resp)
fasthttp.ReleaseRequest(req)
}

View File

@@ -0,0 +1,199 @@
package remotewrite
import (
"flag"
"sync"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/persistentqueue"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
"github.com/VictoriaMetrics/metrics"
"github.com/golang/snappy"
)
var (
flushInterval = flag.Duration("remoteWrite.flushInterval", time.Second, "Interval for flushing the data to remote storage. "+
"Higher value reduces network bandwidth usage at the cost of delayed push of scraped data to remote storage")
maxUnpackedBlockSize = flag.Int("remoteWrite.maxBlockSize", 32*1024*1024, "The maximum size in bytes of unpacked request to send to remote storage. "+
"It shouldn't exceed -maxInsertRequestSize from VictoriaMetrics")
)
// maxRowsPerBlock is the maximum number of rows to send per block.
const maxRowsPerBlock = 10000
type pendingSeries struct {
mu sync.Mutex
wr writeRequest
stopCh chan struct{}
periodicFlusherWG sync.WaitGroup
}
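// newPendingSeries creates a pendingSeries that accumulates pushed time series and periodically flushes them as blocks via pushBlock.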
func newPendingSeries(pushBlock func(block []byte)) *pendingSeries {
var ps pendingSeries
ps.wr.pushBlock = pushBlock
ps.stopCh = make(chan struct{})
ps.periodicFlusherWG.Add(1)
go func() {
defer ps.periodicFlusherWG.Done()
ps.periodicFlusher()
}()
return &ps
}
func (ps *pendingSeries) MustStop() {
close(ps.stopCh)
ps.periodicFlusherWG.Wait()
}
func (ps *pendingSeries) Push(tss []prompbmarshal.TimeSeries) {
ps.mu.Lock()
ps.wr.push(tss)
ps.mu.Unlock()
}
func (ps *pendingSeries) periodicFlusher() {
ticker := time.NewTicker(*flushInterval)
defer ticker.Stop()
mustStop := false
for !mustStop {
select {
case <-ps.stopCh:
mustStop = true
case <-ticker.C:
if time.Since(ps.wr.lastFlushTime) < *flushInterval/2 {
continue
}
}
ps.mu.Lock()
ps.wr.flush()
ps.mu.Unlock()
}
}
type writeRequest struct {
wr prompbmarshal.WriteRequest
pushBlock func(block []byte)
lastFlushTime time.Time
tss []prompbmarshal.TimeSeries
labels []prompbmarshal.Label
samples []prompbmarshal.Sample
buf []byte
}
func (wr *writeRequest) reset() {
wr.wr.Timeseries = nil
for i := range wr.tss {
ts := &wr.tss[i]
ts.Labels = nil
ts.Samples = nil
}
wr.tss = wr.tss[:0]
for i := range wr.labels {
label := &wr.labels[i]
label.Name = ""
label.Value = ""
}
wr.labels = wr.labels[:0]
wr.samples = wr.samples[:0]
wr.buf = wr.buf[:0]
}
func (wr *writeRequest) flush() {
wr.wr.Timeseries = wr.tss
wr.lastFlushTime = time.Now()
pushWriteRequest(&wr.wr, wr.pushBlock)
wr.reset()
}
func (wr *writeRequest) push(src []prompbmarshal.TimeSeries) {
tssDst := wr.tss
for i := range src {
tssDst = append(tssDst, prompbmarshal.TimeSeries{})
dst := &tssDst[len(tssDst)-1]
wr.copyTimeSeries(dst, &src[i])
if len(wr.tss) >= maxRowsPerBlock {
wr.flush()
tssDst = wr.tss
}
}
wr.tss = tssDst
}
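// copyTimeSeries copies a single-sample time series from src to dst.
// Label names and values are appended to wr.buf and referenced via unsafe strings in order to reduce allocations.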
func (wr *writeRequest) copyTimeSeries(dst, src *prompbmarshal.TimeSeries) {
labelsDst := wr.labels
labelsLen := len(wr.labels)
samplesDst := wr.samples
buf := wr.buf
for i := range src.Labels {
labelsDst = append(labelsDst, prompbmarshal.Label{})
dstLabel := &labelsDst[len(labelsDst)-1]
srcLabel := &src.Labels[i]
buf = append(buf, srcLabel.Name...)
dstLabel.Name = bytesutil.ToUnsafeString(buf[len(buf)-len(srcLabel.Name):])
buf = append(buf, srcLabel.Value...)
dstLabel.Value = bytesutil.ToUnsafeString(buf[len(buf)-len(srcLabel.Value):])
}
dst.Labels = labelsDst[labelsLen:]
samplesDst = append(samplesDst, prompbmarshal.Sample{})
dstSample := &samplesDst[len(samplesDst)-1]
if len(src.Samples) != 1 {
logger.Panicf("BUG: unexpected number of samples in time series; got %d; want 1", len(src.Samples))
}
*dstSample = src.Samples[0]
dst.Samples = samplesDst[len(samplesDst)-1:]
wr.samples = samplesDst
wr.labels = labelsDst
wr.buf = buf
}
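// pushWriteRequest marshals wr, compresses it with snappy and passes the resulting block to pushBlock.
// Blocks exceeding the configured size limits are recursively split in half before being pushed.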
func pushWriteRequest(wr *prompbmarshal.WriteRequest, pushBlock func(block []byte)) {
if len(wr.Timeseries) == 0 {
// Nothing to push
return
}
bb := writeRequestBufPool.Get()
bb.B = prompbmarshal.MarshalWriteRequest(bb.B[:0], wr)
if len(bb.B) <= *maxUnpackedBlockSize {
zb := snappyBufPool.Get()
zb.B = snappy.Encode(zb.B[:cap(zb.B)], bb.B)
writeRequestBufPool.Put(bb)
if len(zb.B) <= persistentqueue.MaxBlockSize {
pushBlock(zb.B)
blockSizeRows.Update(float64(len(wr.Timeseries)))
blockSizeBytes.Update(float64(len(zb.B)))
snappyBufPool.Put(zb)
return
}
snappyBufPool.Put(zb)
} else {
writeRequestBufPool.Put(bb)
}
// Too big block. Recursively split it into smaller parts.
timeseries := wr.Timeseries
n := len(timeseries) / 2
wr.Timeseries = timeseries[:n]
pushWriteRequest(wr, pushBlock)
wr.Timeseries = timeseries[n:]
pushWriteRequest(wr, pushBlock)
wr.Timeseries = timeseries
}
var (
blockSizeBytes = metrics.NewHistogram(`vmagent_remotewrite_block_size_bytes`)
blockSizeRows = metrics.NewHistogram(`vmagent_remotewrite_block_size_rows`)
)
var writeRequestBufPool bytesutil.ByteBufferPool
var snappyBufPool bytesutil.ByteBufferPool

View File

@@ -0,0 +1,108 @@
package remotewrite
import (
"flag"
"strings"
"sync"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promrelabel"
)
var (
extraLabelsUnparsed = flagutil.NewArray("remoteWrite.label", "Optional label in the form 'name=value' to add to all the metrics before sending them to -remoteWrite.url. "+
"Pass multiple -remoteWrite.label flags in order to add multiple flags to metrics before sending them to remote storage")
relabelConfigPath = flag.String("remoteWrite.relabelConfig", "", "Optional path to file with relabel_config entries. These entries are applied to all the metrics "+
"before sending them to -remoteWrite.url. See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config for details")
)
var extraLabels []prompbmarshal.Label
var prcs []promrelabel.ParsedRelabelConfig
// initRelabel must be called after parsing command-line flags.
func initRelabel() {
// Init extraLabels
for _, s := range *extraLabelsUnparsed {
n := strings.IndexByte(s, '=')
if n < 0 {
logger.Panicf("FATAL: missing '=' in `-remoteWrite.label`. It must contain label in the form `name=value`; got %q", s)
}
extraLabels = append(extraLabels, prompbmarshal.Label{
Name: s[:n],
Value: s[n+1:],
})
}
// Init prcs
if len(*relabelConfigPath) > 0 {
var err error
prcs, err = promrelabel.LoadRelabelConfigs(*relabelConfigPath)
if err != nil {
logger.Panicf("FATAL: cannot load relabel configs from -remoteWrite.relabelConfig=%q: %s", *relabelConfigPath, err)
}
}
}
func resetRelabel() {
extraLabels = nil
prcs = nil
}
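// applyRelabeling adds the configured extra labels to each time series in wr and applies the relabel configs.
// Time series left without labels after relabeling are dropped.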
func (rctx *relabelCtx) applyRelabeling(wr *prompbmarshal.WriteRequest) {
if len(extraLabels) == 0 && len(prcs) == 0 {
// Nothing to change.
return
}
tss := wr.Timeseries
tssDst := tss[:0]
labels := rctx.labels[:0]
for i := range tss {
ts := &tss[i]
labelsLen := len(labels)
labels = append(labels, ts.Labels...)
// extraLabels must be added before applying relabeling according to https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write
for j := range extraLabels {
extraLabel := &extraLabels[j]
tmp := promrelabel.GetLabelByName(labels[labelsLen:], extraLabel.Name)
if tmp != nil {
tmp.Value = extraLabel.Value
} else {
labels = append(labels, *extraLabel)
}
}
labels = promrelabel.ApplyRelabelConfigs(labels, labelsLen, prcs, true)
if len(labels) == labelsLen {
// Drop the current time series, since relabeling removed all the labels.
continue
}
tssDst = append(tssDst, prompbmarshal.TimeSeries{
Labels: labels[labelsLen:],
Samples: ts.Samples,
})
}
rctx.labels = labels
wr.Timeseries = tssDst
}
type relabelCtx struct {
// pool for labels, which are used during the relabeling.
labels []prompbmarshal.Label
}
func (rctx *relabelCtx) reset() {
labels := rctx.labels
for i := range labels {
label := &labels[i]
label.Name = ""
label.Value = ""
}
rctx.labels = rctx.labels[:0]
}
var relabelCtxPool = &sync.Pool{
New: func() interface{} {
return &relabelCtx{}
},
}

View File

@@ -0,0 +1,127 @@
package remotewrite
import (
"flag"
"fmt"
"sync/atomic"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/memory"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/persistentqueue"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
"github.com/VictoriaMetrics/metrics"
xxhash "github.com/cespare/xxhash/v2"
)
var (
remoteWriteURLs = flagutil.NewArray("remoteWrite.url", "Remote storage URL to write data to. It must support Prometheus remote_write API. "+
"It is recommended using VictoriaMetrics as remote storage. Example url: http://<victoriametrics-host>:8428/api/v1/write . "+
"Pass multiple -remoteWrite.url flags in order to write data concurrently to multiple remote storage systems")
tmpDataPath = flag.String("remoteWrite.tmpDataPath", "vmagent-remotewrite-data", "Path to directory where temporary data for remote write component is stored")
queues = flag.Int("remoteWrite.queues", 1, "The number of concurrent queues to each -remoteWrite.url. Set more queues if a single queue "+
"isn't enough for sending high volume of collected data to remote storage")
showRemoteWriteURL = flag.Bool("remoteWrite.showURL", false, "Whether to show -remoteWrite.url in the exported metrics. "+
"It is hidden by default, since it can contain sensistive auth info")
)
// Init initializes remotewrite.
//
// It must be called after flag.Parse().
//
// Stop must be called for graceful shutdown.
func Init() {
if len(*remoteWriteURLs) == 0 {
logger.Panicf("FATAL: at least one `-remoteWrite.url` must be set")
}
if !*showRemoteWriteURL {
// remoteWrite.url can contain authentication codes, so hide it at `/metrics` output.
httpserver.RegisterSecretFlag("remoteWrite.url")
}
initRelabel()
maxInmemoryBlocks := memory.Allowed() / len(*remoteWriteURLs) / maxRowsPerBlock / 100
if maxInmemoryBlocks > 200 {
// There is not much sense in keeping a higher number of blocks in memory,
// since this means the producer outperforms the consumer and the queue
// will keep growing. It is better to store the queue on disk in this case.
maxInmemoryBlocks = 200
}
if maxInmemoryBlocks < 2 {
maxInmemoryBlocks = 2
}
for i, remoteWriteURL := range *remoteWriteURLs {
h := xxhash.Sum64([]byte(remoteWriteURL))
path := fmt.Sprintf("%s/persistent-queue/%016X", *tmpDataPath, h)
fq := persistentqueue.MustOpenFastQueue(path, remoteWriteURL, maxInmemoryBlocks)
urlLabelValue := fmt.Sprintf("secret-url-%d", i+1)
if *showRemoteWriteURL {
urlLabelValue = remoteWriteURL
}
_ = metrics.NewGauge(fmt.Sprintf(`vmagent_remotewrite_pending_data_bytes{url=%q, hash="%016X"}`, urlLabelValue, h), func() float64 {
return float64(fq.GetPendingBytes())
})
_ = metrics.NewGauge(fmt.Sprintf(`vmagent_remotewrite_pending_inmemory_blocks{url=%q}`, urlLabelValue), func() float64 {
return float64(fq.GetInmemoryQueueLen())
})
c := newClient(remoteWriteURL, urlLabelValue, fq)
fqs = append(fqs, fq)
cs = append(cs, c)
}
pss = make([]*pendingSeries, *queues)
for i := range pss {
pss[i] = newPendingSeries(pushBlockToPersistentQueues)
}
}
// Stop stops remotewrite.
//
// It is expected that nobody calls Push during and after the call to this func.
func Stop() {
for _, ps := range pss {
ps.MustStop()
}
// Close all the persistent queues. This should unblock clients waiting in MustReadBlock.
for _, fq := range fqs {
fq.MustClose()
}
fqs = nil
// Stop all the clients
for _, c := range cs {
c.MustStop()
}
cs = nil
resetRelabel()
}
// Push sends wr to remote storage systems set via `-remoteWrite.url`.
//
// Each timeseries in wr.Timeseries must contain one sample.
func Push(wr *prompbmarshal.WriteRequest) {
rctx := relabelCtxPool.Get().(*relabelCtx)
rctx.applyRelabeling(wr)
idx := atomic.AddUint64(&pssNextIdx, 1) % uint64(len(pss))
pss[idx].Push(wr.Timeseries)
rctx.reset()
relabelCtxPool.Put(rctx)
}
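// pushBlockToPersistentQueues writes block to the persistent queue of every configured -remoteWrite.url.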
func pushBlockToPersistentQueues(block []byte) {
for _, fq := range fqs {
fq.MustWriteBlock(block)
}
}
var fqs []*persistentqueue.FastQueue
var cs []*client
var pssNextIdx uint64
var pss []*pendingSeries

View File

@@ -0,0 +1,71 @@
package remotewrite
import (
"net"
"sync/atomic"
"github.com/VictoriaMetrics/metrics"
"github.com/valyala/fasthttp"
)
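// statDial dials addr and wraps the established connection with connection-level metrics.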
func statDial(addr string) (net.Conn, error) {
conn, err := fasthttp.Dial(addr)
dialsTotal.Inc()
if err != nil {
dialErrors.Inc()
return nil, err
}
conns.Inc()
sc := &statConn{
Conn: conn,
}
return sc, nil
}
var (
dialsTotal = metrics.NewCounter(`vmagent_remotewrite_dials_total`)
dialErrors = metrics.NewCounter(`vmagent_remotewrite_dial_errors_total`)
conns = metrics.NewCounter(`vmagent_remotewrite_conns`)
)
type statConn struct {
closed uint64
net.Conn
}
func (sc *statConn) Read(p []byte) (int, error) {
n, err := sc.Conn.Read(p)
connReadsTotal.Inc()
if err != nil {
connReadErrors.Inc()
}
connBytesRead.Add(n)
return n, err
}
func (sc *statConn) Write(p []byte) (int, error) {
n, err := sc.Conn.Write(p)
connWritesTotal.Inc()
if err != nil {
connWriteErrors.Inc()
}
connBytesWritten.Add(n)
return n, err
}
func (sc *statConn) Close() error {
err := sc.Conn.Close()
if atomic.AddUint64(&sc.closed, 1) == 1 {
conns.Dec()
}
return err
}
var (
connReadsTotal = metrics.NewCounter(`vmagent_remotewrite_conn_reads_total`)
connWritesTotal = metrics.NewCounter(`vmagent_remotewrite_conn_writes_total`)
connReadErrors = metrics.NewCounter(`vmagent_remotewrite_conn_read_errors_total`)
connWriteErrors = metrics.NewCounter(`vmagent_remotewrite_conn_write_errors_total`)
connBytesRead = metrics.NewCounter(`vmagent_remotewrite_conn_bytes_read_total`)
connBytesWritten = metrics.NewCounter(`vmagent_remotewrite_conn_bytes_written_total`)
)

BIN
app/vmagent/vmagent.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 69 KiB

View File

@@ -0,0 +1,70 @@
package vmimport
import (
"net/http"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/common"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmagent/remotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
parser "github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/vmimport"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/writeconcurrencylimiter"
"github.com/VictoriaMetrics/metrics"
)
var (
rowsInserted = metrics.NewCounter(`vmagent_rows_inserted_total{type="vmimport"}`)
rowsPerInsert = metrics.NewHistogram(`vmagent_rows_per_insert{type="vmimport"}`)
)
// InsertHandler processes `/api/v1/import` request.
//
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6
func InsertHandler(req *http.Request) error {
return writeconcurrencylimiter.Do(func() error {
return parser.ParseStream(req, insertRows)
})
}
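// insertRows converts the parsed rows into Prometheus remote write time series and pushes them to the configured remote storage.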
func insertRows(rows []parser.Row) error {
ctx := common.GetPushCtx()
defer common.PutPushCtx(ctx)
rowsTotal := 0
tssDst := ctx.WriteRequest.Timeseries[:0]
labels := ctx.Labels[:0]
samples := ctx.Samples[:0]
for i := range rows {
r := &rows[i]
labelsLen := len(labels)
for j := range r.Tags {
tag := &r.Tags[j]
labels = append(labels, prompbmarshal.Label{
Name: bytesutil.ToUnsafeString(tag.Key),
Value: bytesutil.ToUnsafeString(tag.Value),
})
}
values := r.Values
timestamps := r.Timestamps
_ = timestamps[len(values)-1]
samplesLen := len(samples)
for j, value := range values {
samples = append(samples, prompbmarshal.Sample{
Value: value,
Timestamp: timestamps[j],
})
}
tssDst = append(tssDst, prompbmarshal.TimeSeries{
Labels: labels[labelsLen:],
Samples: samples[samplesLen:],
})
rowsTotal += len(values)
}
ctx.WriteRequest.Timeseries = tssDst
ctx.Labels = labels
ctx.Samples = samples
remotewrite.Push(&ctx.WriteRequest)
rowsInserted.Add(rowsTotal)
rowsPerInsert.Update(float64(rowsTotal))
return nil
}

67
app/vmbackup/Makefile Normal file
View File

@@ -0,0 +1,67 @@
# All these commands must run from repository root.
vmbackup:
APP_NAME=vmbackup $(MAKE) app-local
vmbackup-prod:
APP_NAME=vmbackup $(MAKE) app-via-docker
vmbackup-pure-prod:
APP_NAME=vmbackup $(MAKE) app-via-docker-pure
vmbackup-amd64-prod:
APP_NAME=vmbackup $(MAKE) app-via-docker-amd64
vmbackup-arm-prod:
APP_NAME=vmbackup $(MAKE) app-via-docker-arm
vmbackup-arm64-prod:
APP_NAME=vmbackup $(MAKE) app-via-docker-arm64
vmbackup-ppc64le-prod:
APP_NAME=vmbackup $(MAKE) app-via-docker-ppc64le
vmbackup-386-prod:
APP_NAME=vmbackup $(MAKE) app-via-docker-386
package-vmbackup:
APP_NAME=vmbackup $(MAKE) package-via-docker
package-vmbackup-pure:
APP_NAME=vmbackup $(MAKE) package-via-docker-pure
package-vmbackup-amd64:
APP_NAME=vmbackup $(MAKE) package-via-docker-amd64
package-vmbackup-arm:
APP_NAME=vmbackup $(MAKE) package-via-docker-arm
package-vmbackup-arm64:
APP_NAME=vmbackup $(MAKE) package-via-docker-arm64
package-vmbackup-ppc64le:
APP_NAME=vmbackup $(MAKE) package-via-docker-ppc64le
package-vmbackup-386:
APP_NAME=vmbackup $(MAKE) package-via-docker-386
publish-vmbackup:
APP_NAME=vmbackup $(MAKE) publish-via-docker
vmbackup-pure:
APP_NAME=vmbackup $(MAKE) app-local-pure
vmbackup-amd64:
CGO_ENABLED=1 GOOS=linux GOARCH=amd64 GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/vmbackup-amd64 ./app/vmbackup
vmbackup-arm:
CGO_ENABLED=0 GOOS=linux GOARCH=arm GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/vmbackup-arm ./app/vmbackup
vmbackup-arm64:
CGO_ENABLED=0 GOOS=linux GOARCH=arm64 GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/vmbackup-arm64 ./app/vmbackup
vmbackup-ppc64le:
CGO_ENABLED=0 GOOS=linux GOARCH=ppc64le GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/vmbackup-ppc64le ./app/vmbackup
vmbackup-386:
CGO_ENABLED=0 GOOS=linux GOARCH=386 GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/vmbackup-386 ./app/vmbackup

178
app/vmbackup/README.md Normal file
View File

@@ -0,0 +1,178 @@
## vmbackup
`vmbackup` creates VictoriaMetrics data backups from [instant snapshots](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#how-to-work-with-snapshots).
Supported storage systems for backups:
* [GCS](https://cloud.google.com/storage/). Example: `gcs://<bucket>/<path/to/backup>`
* [S3](https://aws.amazon.com/s3/). Example: `s3://<bucket>/<path/to/backup>`
* Any S3-compatible storage such as [MinIO](https://github.com/minio/minio). See `-customS3Endpoint` command-line flag.
* Local filesystem. Example: `fs://</absolute/path/to/backup>`
Incremental backups and full backups are supported. Incremental backups are created automatically if the destination path already contains data from the previous backup.
Full backups can be sped up with `-origin` pointing to an already existing backup on the same remote storage. In this case `vmbackup` makes a server-side copy of the shared
data between the existing backup and the new backup. This saves time and costs on data transfer.
The backup process can be interrupted at any time. It is automatically resumed from the interruption point when `vmbackup` is restarted with the same args.
Backed up data can be restored with [vmrestore](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmrestore/README.md).
### Use cases
#### Regular backups
Regular backup can be performed with the following command:
```
vmbackup -storageDataPath=</path/to/victoria-metrics-data> -snapshotName=<local-snapshot> -dst=gcs://<bucket>/<path/to/new/backup>
```
* `</path/to/victoria-metrics-data>` - path to VictoriaMetrics data pointed to by the `-storageDataPath` command-line flag in single-node VictoriaMetrics or in cluster `vmstorage`.
There is no need to stop VictoriaMetrics for creating backups, since they are performed from immutable [instant snapshots](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#how-to-work-with-snapshots).
* `<local-snapshot>` is the snapshot to back up. See [how to create instant snapshots](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#how-to-work-with-snapshots).
* `<bucket>` is the name of an already existing [GCS bucket](https://cloud.google.com/storage/docs/creating-buckets).
* `<path/to/new/backup>` is the destination path where the new backup will be placed. A concrete invocation with placeholder values is sketched below.
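The following sketch assumes a single-node VictoriaMetrics instance listening on `localhost:8428` and a snapshot created via its `/snapshot/create` endpoint; the data path and bucket name are placeholders:
```
# Create an instant snapshot and extract its name from the JSON response.
snapshot=$(curl -s http://localhost:8428/snapshot/create | sed -n 's/.*"snapshot":"\([^"]*\)".*/\1/p')
# Back up the snapshot to GCS.
vmbackup -storageDataPath=/var/lib/victoria-metrics-data -snapshotName="$snapshot" -dst=gcs://my-bucket/victoria-metrics-backup
```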
#### Regular backups with server-side copy from existing backup
If the destination GCS bucket already contains a previous backup at the `-origin` path, then a new backup can be sped up
with the following command:
```
vmbackup -storageDataPath=</path/to/victoria-metrics-data> -snapshotName=<local-snapshot> -dst=gcs://<bucket>/<path/to/new/backup> -origin=gcs://<bucket>/<path/to/existing/backup>
```
This saves time and network bandwidth costs by performing a server-side copy of the shared data from `-origin` to `-dst`.
#### Incremental backups
Incremental backups are performed if `-dst` points to an already existing backup. In this case only new data is uploaded to the remote storage.
This saves time and network bandwidth costs when working with big backups:
```
vmbackup -storageDataPath=</path/to/victoria-metrics-data> -snapshotName=<local-snapshot> -dst=gcs://<bucket>/<path/to/existing/backup>
```
#### Smart backups
Smart backups mean storing full daily backups in `YYYYMMDD` folders while creating an incremental hourly backup in the `latest` folder:
* Run the following command every hour:
```
vmbackup -snapshotName=<latest-snapshot> -dst=gcs://<bucket>/latest
```
Where `<latest-snapshot>` is the latest [snapshot](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#how-to-work-with-snapshots).
The command will upload only changed data to `gcs://<bucket>/latest`.
* Run the following command once a day:
```
vmbackup -snapshotName=<daily-snapshot> -dst=gcs://<bucket>/<YYYYMMDD> -origin=gcs://<bucket>/latest
```
Where `<daily-snapshot>` is the snapshot for the last day `<YYYYMMDD>`.
This approach saves network bandwidth costs on hourly backups (since they are incremental) and allows recovering data from either the last hour (the `latest` backup)
or from any given day (`YYYYMMDD` backups). Note that the hourly backup shouldn't run while the daily backup is being created.
Do not forget to remove old snapshots and backups when they are no longer needed in order to save storage costs. A cron scheduling sketch for this approach is shown below.
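Snapshot names, bucket and schedule in the sketch are placeholders; in practice the two jobs should be serialized (e.g. via `flock`) so the hourly backup doesn't run while the daily backup is being created:
```
# Hourly incremental backup into the `latest` folder.
15 * * * * vmbackup -snapshotName=<latest-snapshot> -dst=gcs://<bucket>/latest
# Daily backup into a YYYYMMDD folder with server-side copy from `latest`.
30 0 * * * vmbackup -snapshotName=<daily-snapshot> -dst=gcs://<bucket>/<YYYYMMDD> -origin=gcs://<bucket>/latest
```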
### How does it work?
The backup algorithm is the following:
1. Collect information about files in the `-snapshotName`, in the `-dst` and in the `-origin`.
2. Determine files in `-dst`, which are missing in `-snapshotName`, and delete them. These are usually small files, which are already merged into bigger files in the snapshot.
3. Determine files from `-snapshotName`, which are missing in `-dst`. These are usually small new files and bigger merged files.
4. Determine files from step 3, which exist in the `-origin`, and perform server-side copy of these files from `-origin` to `-dst`.
These are usually the biggest and the oldest files, which are shared between backups.
5. Upload the remaining files from step 3 from `-snapshotName` to `-dst`.
The algorithm splits source files into 100MB chunks. Each chunk is stored as a separate file in the backup.
Such splitting minimizes the amount of data to re-transfer after temporary errors.
`vmbackup` relies on [instant snapshot](https://medium.com/@valyala/how-victoriametrics-makes-instant-snapshots-for-multi-terabyte-time-series-data-e1f3fb0e0282) properties:
- All the files in the snapshot are immutable.
- Old files are periodically merged into new files.
- Smaller files have higher probability to be merged.
- Consecutive snapshots share many identical files.
These properties allow performing fast and cheap incremental backups and server-side copying from `-origin` paths.
`vmbackup` can work improperly or slowly when these properties are violated.
### Troubleshooting
* If the backup is slow, then try setting a higher value for the `-concurrency` flag. This will increase the number of concurrent workers that upload data to the backup storage.
* If `vmbackup` eats all the network bandwidth, then set `-maxBytesPerSecond` to the desired value.
* If `vmbackup` has been interrupted due to a temporary error, then just restart it with the same args. It will resume the backup process.
### Advanced usage
Run `vmbackup -help` in order to see all the available options:
```
-concurrency int
The number of concurrent workers. Higher concurrency may reduce backup duration (default 10)
-configFilePath string
Path to file with S3 configs. Configs are loaded from default location if not set.
See https://docs.aws.amazon.com/general/latest/gr/aws-security-credentials.html
-configProfile string
Profile name for S3 configs (default "default")
-credsFilePath string
Path to file with GCS or S3 credentials. Credentials are loaded from default locations if not set.
See https://cloud.google.com/iam/docs/creating-managing-service-account-keys and https://docs.aws.amazon.com/general/latest/gr/aws-security-credentials.html
-customS3Endpoint string
Custom S3 endpoint for use with S3-compatible storages (e.g. MinIO). S3 is used if not set
-dst string
Where to put the backup on the remote storage. Example: gcs://bucket/path/to/backup/dir, s3://bucket/path/to/backup/dir or fs:///path/to/local/backup/dir
-dst can point to the previous backup. In this case incremental backup is performed, i.e. only changed data is uploaded
-loggerLevel string
Minimum level of errors to log. Possible values: INFO, ERROR, FATAL, PANIC (default "INFO")
-maxBytesPerSecond int
The maximum upload speed. There is no limit if it is set to 0
-memory.allowedPercent float
Allowed percent of system memory VictoriaMetrics caches may occupy (default 60)
-origin string
Optional origin directory on the remote storage with old backup for server-side copying when performing full backup. This speeds up full backups
-snapshotName string
Name for the snapshot to backup. See https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#how-to-work-with-snapshots
-storageDataPath string
Path to VictoriaMetrics data. Must match -storageDataPath from VictoriaMetrics or vmstorage (default "victoria-metrics-data")
-version
Show VictoriaMetrics version
```
### How to build from sources
It is recommended to use [binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) - see `vmutils-*` archives there.
#### Development build
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.12.
2. Run `make vmbackup` from the root folder of the repository.
It builds `vmbackup` binary and puts it into the `bin` folder.
#### Production build
1. [Install docker](https://docs.docker.com/install/).
2. Run `make vmbackup-prod` from the root folder of the repository.
It builds `vmbackup-prod` binary and puts it into the `bin` folder.
#### Building docker images
Run `make package-vmbackup`. It builds `victoriametrics/vmbackup:<PKG_TAG>` docker image locally.
`<PKG_TAG>` is an auto-generated image tag, which depends on the source code in the repository.
The `<PKG_TAG>` may be manually set via `PKG_TAG=foobar make package-vmbackup`.

View File

@@ -0,0 +1,7 @@
ARG certs_image
FROM $certs_image AS certs
FROM scratch
COPY --from=certs /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-certificates.crt
ARG src_binary
COPY $src_binary ./vmbackup-prod
ENTRYPOINT ["/vmbackup-prod"]

115
app/vmbackup/main.go Normal file
View File

@@ -0,0 +1,115 @@
package main
import (
"flag"
"fmt"
"os"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/backup/actions"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/backup/common"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/backup/fslocal"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/envflag"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
)
var (
storageDataPath = flag.String("storageDataPath", "victoria-metrics-data", "Path to VictoriaMetrics data. Must match -storageDataPath from VictoriaMetrics or vmstorage")
snapshotName = flag.String("snapshotName", "", "Name for the snapshot to backup. See https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#how-to-work-with-snapshots")
dst = flag.String("dst", "", "Where to put the backup on the remote storage. "+
"Example: gcs://bucket/path/to/backup/dir, s3://bucket/path/to/backup/dir or fs:///path/to/local/backup/dir\n"+
"-dst can point to the previous backup. In this case incremental backup is performed, i.e. only changed data is uploaded")
origin = flag.String("origin", "", "Optional origin directory on the remote storage with old backup for server-side copying when performing full backup. This speeds up full backups")
concurrency = flag.Int("concurrency", 10, "The number of concurrent workers. Higher concurrency may reduce backup duration")
maxBytesPerSecond = flag.Int("maxBytesPerSecond", 0, "The maximum upload speed. There is no limit if it is set to 0")
)
func main() {
flag.Usage = usage
envflag.Parse()
buildinfo.Init()
srcFS, err := newSrcFS()
if err != nil {
logger.Fatalf("%s", err)
}
dstFS, err := newDstFS()
if err != nil {
logger.Fatalf("%s", err)
}
originFS, err := newOriginFS()
if err != nil {
logger.Fatalf("%s", err)
}
a := &actions.Backup{
Concurrency: *concurrency,
Src: srcFS,
Dst: dstFS,
Origin: originFS,
}
if err := a.Run(); err != nil {
logger.Fatalf("cannot create backup: %s", err)
}
}
func usage() {
const s = `
vmbackup performs backups for VictoriaMetrics data from instant snapshots to gcs, s3
or local filesystem. Backed up data can be restored with vmrestore.
See the docs at https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmbackup/README.md .
`
f := flag.CommandLine.Output()
fmt.Fprintf(f, "%s\n", s)
flag.PrintDefaults()
}
func newSrcFS() (*fslocal.FS, error) {
if len(*snapshotName) == 0 {
return nil, fmt.Errorf("`-snapshotName` cannot be empty")
}
snapshotPath := *storageDataPath + "/snapshots/" + *snapshotName
// Verify the snapshot exists.
f, err := os.Open(snapshotPath)
if err != nil {
return nil, fmt.Errorf("cannot open snapshot at %q: %s", snapshotPath, err)
}
fi, err := f.Stat()
_ = f.Close()
if err != nil {
return nil, fmt.Errorf("cannot stat %q: %s", snapshotPath, err)
}
if !fi.IsDir() {
return nil, fmt.Errorf("snapshot %q must be a directory", snapshotPath)
}
fs := &fslocal.FS{
Dir: snapshotPath,
MaxBytesPerSecond: *maxBytesPerSecond,
}
if err := fs.Init(); err != nil {
return nil, fmt.Errorf("cannot initialize fs: %s", err)
}
return fs, nil
}
func newDstFS() (common.RemoteFS, error) {
fs, err := actions.NewRemoteFS(*dst)
if err != nil {
return nil, fmt.Errorf("cannot parse `-dst`=%q: %s", *dst, err)
}
return fs, nil
}
func newOriginFS() (common.RemoteFS, error) {
if len(*origin) == 0 {
return nil, nil
}
fs, err := actions.NewRemoteFS(*origin)
if err != nil {
return nil, fmt.Errorf("cannot parse `-origin`=%q: %s", *origin, err)
}
return fs, nil
}

34
app/vminsert/Makefile Normal file
View File

@@ -0,0 +1,34 @@
# All these commands must run from repository root.
run-vminsert:
APP_NAME=vminsert ARGS='-storageNode=localhost:8400' $(MAKE) run-via-docker
vminsert:
APP_NAME=vminsert $(MAKE) app-local
vminsert-race:
APP_NAME=vminsert RACE=-race $(MAKE) app-local
vminsert-prod:
APP_NAME=vminsert $(MAKE) app-via-docker
vminsert-pure-prod:
APP_NAME=vminsert $(MAKE) app-via-docker-pure
vminsert-prod-race:
APP_NAME=vminsert RACE=-race $(MAKE) app-via-docker
vminsert-pure:
APP_NAME=vminsert $(MAKE) app-local-pure
package-vminsert:
APP_NAME=vminsert $(MAKE) package-via-docker
package-vminsert-race:
APP_NAME=vminsert RACE=-race $(MAKE) package-via-docker
publish-vminsert:
APP_NAME=vminsert $(MAKE) publish-via-docker
publish-vminsert-race:
APP_NAME=vminsert RACE=-race $(MAKE) publish-via-docker

View File

@@ -1 +1 @@
`vminsert` routes the ingested data to `vmstorage`.
`vminsert` routes the ingested data to `vmstorage` nodes.

View File

@@ -1,106 +0,0 @@
package common
import (
"fmt"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmstorage"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompb"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/storage"
)
// InsertCtx contains common bits for data points insertion.
type InsertCtx struct {
Labels []prompb.Label
mrs []storage.MetricRow
metricNamesBuf []byte
}
// Reset resets ctx for future fill with rowsLen rows.
func (ctx *InsertCtx) Reset(rowsLen int) {
for _, label := range ctx.Labels {
label.Name = nil
label.Value = nil
}
ctx.Labels = ctx.Labels[:0]
for i := range ctx.mrs {
mr := &ctx.mrs[i]
mr.MetricNameRaw = nil
}
ctx.mrs = ctx.mrs[:0]
if n := rowsLen - cap(ctx.mrs); n > 0 {
ctx.mrs = append(ctx.mrs[:cap(ctx.mrs)], make([]storage.MetricRow, n)...)
}
ctx.mrs = ctx.mrs[:rowsLen]
ctx.metricNamesBuf = ctx.metricNamesBuf[:0]
}
func (ctx *InsertCtx) marshalMetricNameRaw(prefix []byte, labels []prompb.Label) []byte {
start := len(ctx.metricNamesBuf)
ctx.metricNamesBuf = append(ctx.metricNamesBuf, prefix...)
ctx.metricNamesBuf = storage.MarshalMetricNameRaw(ctx.metricNamesBuf, labels)
metricNameRaw := ctx.metricNamesBuf[start:]
return metricNameRaw[:len(metricNameRaw):len(metricNameRaw)]
}
// WriteDataPoint writes (timestamp, value) with the given prefix and labels into ctx buffer.
func (ctx *InsertCtx) WriteDataPoint(prefix []byte, labels []prompb.Label, timestamp int64, value float64) {
metricNameRaw := ctx.marshalMetricNameRaw(prefix, labels)
ctx.addRow(metricNameRaw, timestamp, value)
}
// WriteDataPointExt writes (timestamp, value) with the given metricNameRaw and labels into ctx buffer.
//
// It returns metricNameRaw for the given labels if len(metricNameRaw) == 0.
func (ctx *InsertCtx) WriteDataPointExt(metricNameRaw []byte, labels []prompb.Label, timestamp int64, value float64) []byte {
if len(metricNameRaw) == 0 {
metricNameRaw = ctx.marshalMetricNameRaw(nil, labels)
}
ctx.addRow(metricNameRaw, timestamp, value)
return metricNameRaw
}
func (ctx *InsertCtx) addRow(metricNameRaw []byte, timestamp int64, value float64) {
mrs := ctx.mrs
if cap(mrs) > len(mrs) {
mrs = mrs[:len(mrs)+1]
} else {
mrs = append(mrs, storage.MetricRow{})
}
mr := &mrs[len(mrs)-1]
ctx.mrs = mrs
mr.MetricNameRaw = metricNameRaw
mr.Timestamp = timestamp
mr.Value = value
}
// AddLabel adds (name, value) label to ctx.Labels.
//
// name and value must exist until ctx.Labels is used.
func (ctx *InsertCtx) AddLabel(name, value string) {
labels := ctx.Labels
if cap(labels) > len(labels) {
labels = labels[:len(labels)+1]
} else {
labels = append(labels, prompb.Label{})
}
label := &labels[len(labels)-1]
// Do not copy name and value contents for performance reasons.
// This reduces GC overhead on the number of objects and allocations.
label.Name = bytesutil.ToUnsafeBytes(name)
label.Value = bytesutil.ToUnsafeBytes(value)
ctx.Labels = labels
}
// FlushBufs flushes buffered rows to the underlying storage.
func (ctx *InsertCtx) FlushBufs() error {
if err := vmstorage.AddRows(ctx.mrs); err != nil {
return fmt.Errorf("cannot store metrics: %s", err)
}
return nil
}

View File

@@ -1,34 +0,0 @@
package concurrencylimiter
import (
"fmt"
"runtime"
"time"
)
var (
// ch is the channel for limiting concurrent inserts.
// Put an item into it before performing an insert and remove
// the item after the insert is complete.
ch = make(chan struct{}, runtime.GOMAXPROCS(-1)*2)
// waitDuration is the amount of time to wait until at least a single
// concurrent insert out of cap(Ch) inserts is complete.
waitDuration = time.Second * 30
)
// Do calls f with the limited concurrency.
func Do(f func() error) error {
// Limit the number of concurrent inserts in order to prevent excess
// memory usage and CPU thrashing.
t := time.NewTimer(waitDuration)
select {
case ch <- struct{}{}:
t.Stop()
err := f()
<-ch
return err
case <-t.C:
return fmt.Errorf("the server is overloaded with %d concurrent inserts; either increase the number of CPUs or reduce the load", cap(ch))
}
}

View File

@@ -0,0 +1,8 @@
ARG certs_image
FROM $certs_image AS certs
FROM scratch
COPY --from=certs /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-certificates.crt
ARG src_binary
COPY $src_binary ./vminsert-prod
EXPOSE 8480
ENTRYPOINT ["/vminsert-prod"]

View File

@@ -1,166 +1,63 @@
package graphite
import (
"bytes"
"fmt"
"io"
"net"
"runtime"
"sync"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/common"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/concurrencylimiter"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/netstorage"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/auth"
parser "github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/graphite"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/tenantmetrics"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/writeconcurrencylimiter"
"github.com/VictoriaMetrics/metrics"
"github.com/valyala/fastjson/fastfloat"
)
var rowsInserted = metrics.NewCounter(`vm_rows_inserted_total{type="graphite"}`)
var (
rowsInserted = tenantmetrics.NewCounterMap(`vm_rows_inserted_total{type="graphite"}`)
rowsPerInsert = metrics.NewHistogram(`vm_rows_per_insert{type="graphite"}`)
)
// insertHandler processes remote write for graphite plaintext protocol.
// InsertHandler processes remote write for graphite plaintext protocol.
//
// See https://graphite.readthedocs.io/en/latest/feeding-carbon.html#the-plaintext-protocol
func insertHandler(r io.Reader) error {
return concurrencylimiter.Do(func() error {
return insertHandlerInternal(r)
func InsertHandler(at *auth.Token, r io.Reader) error {
return writeconcurrencylimiter.Do(func() error {
return parser.ParseStream(r, func(rows []parser.Row) error {
return insertRows(at, rows)
})
})
}
func insertHandlerInternal(r io.Reader) error {
ctx := getPushCtx()
defer putPushCtx(ctx)
for ctx.Read(r) {
if err := ctx.InsertRows(); err != nil {
func insertRows(at *auth.Token, rows []parser.Row) error {
ctx := netstorage.GetInsertCtx()
defer netstorage.PutInsertCtx(ctx)
ctx.Reset() // This line is required for initializing ctx internals.
atCopy := *at
for i := range rows {
r := &rows[i]
ctx.Labels = ctx.Labels[:0]
ctx.AddLabel("", r.Metric)
for j := range r.Tags {
tag := &r.Tags[j]
if atCopy.AccountID == 0 {
// Multi-tenancy support via custom tags.
// Do not allow overriding AccountID and ProjectID from atCopy for security reasons.
if tag.Key == "VictoriaMetrics_AccountID" {
atCopy.AccountID = uint32(fastfloat.ParseUint64BestEffort(tag.Value))
}
if atCopy.ProjectID == 0 && tag.Key == "VictoriaMetrics_ProjectID" {
atCopy.ProjectID = uint32(fastfloat.ParseUint64BestEffort(tag.Value))
}
}
ctx.AddLabel(tag.Key, tag.Value)
}
if err := ctx.WriteDataPoint(&atCopy, ctx.Labels, r.Timestamp, r.Value); err != nil {
return err
}
}
return ctx.Error()
// Assume that all the rows for a single connection belong to the same (AccountID, ProjectID).
rowsInserted.Get(&atCopy).Add(len(rows))
rowsPerInsert.Update(float64(len(rows)))
return ctx.FlushBufs()
}
func (ctx *pushCtx) InsertRows() error {
rows := ctx.Rows.Rows
ic := &ctx.Common
ic.Reset(len(rows))
for i := range rows {
r := &rows[i]
ic.Labels = ic.Labels[:0]
ic.AddLabel("", r.Metric)
for j := range r.Tags {
tag := &r.Tags[j]
ic.AddLabel(tag.Key, tag.Value)
}
ic.WriteDataPoint(nil, ic.Labels, r.Timestamp, r.Value)
}
rowsInserted.Add(len(rows))
return ic.FlushBufs()
}
const maxReadPacketSize = 4 * 1024 * 1024
const flushTimeout = 3 * time.Second
func (ctx *pushCtx) Read(r io.Reader) bool {
graphiteReadCalls.Inc()
if ctx.err != nil {
return false
}
if c, ok := r.(net.Conn); ok {
if err := c.SetReadDeadline(time.Now().Add(flushTimeout)); err != nil {
graphiteReadErrors.Inc()
ctx.err = fmt.Errorf("cannot set read deadline: %s", err)
return false
}
}
lr := io.LimitReader(r, maxReadPacketSize)
ctx.reqBuf.Reset()
ctx.reqBuf.B = append(ctx.reqBuf.B[:0], ctx.tailBuf...)
n, err := io.CopyBuffer(&ctx.reqBuf, lr, ctx.copyBuf[:])
if err != nil {
if ne, ok := err.(net.Error); ok && ne.Timeout() {
// Flush the read data on timeout and try reading again.
} else {
graphiteReadErrors.Inc()
ctx.err = fmt.Errorf("cannot read graphite plaintext protocol data: %s", err)
return false
}
} else if n < maxReadPacketSize {
// Mark the end of stream.
ctx.err = io.EOF
}
// Parse all the rows until the last newline in ctx.reqBuf.B
nn := bytes.LastIndexByte(ctx.reqBuf.B, '\n')
ctx.tailBuf = ctx.tailBuf[:0]
if nn >= 0 {
ctx.tailBuf = append(ctx.tailBuf[:0], ctx.reqBuf.B[nn+1:]...)
ctx.reqBuf.B = ctx.reqBuf.B[:nn]
}
if err = ctx.Rows.Unmarshal(bytesutil.ToUnsafeString(ctx.reqBuf.B)); err != nil {
graphiteUnmarshalErrors.Inc()
ctx.err = fmt.Errorf("cannot unmarshal graphite plaintext protocol data with size %d: %s", len(ctx.reqBuf.B), err)
return false
}
// Convert timestamps from seconds to milliseconds
for i := range ctx.Rows.Rows {
ctx.Rows.Rows[i].Timestamp *= 1e3
}
return true
}
type pushCtx struct {
Rows Rows
Common common.InsertCtx
reqBuf bytesutil.ByteBuffer
tailBuf []byte
copyBuf [16 * 1024]byte
err error
}
func (ctx *pushCtx) Error() error {
if ctx.err == io.EOF {
return nil
}
return ctx.err
}
func (ctx *pushCtx) reset() {
ctx.Rows.Reset()
ctx.Common.Reset(0)
ctx.reqBuf.Reset()
ctx.tailBuf = ctx.tailBuf[:0]
ctx.err = nil
}
var (
graphiteReadCalls = metrics.NewCounter(`vm_read_calls_total{name="graphite"}`)
graphiteReadErrors = metrics.NewCounter(`vm_read_errors_total{name="graphite"}`)
graphiteUnmarshalErrors = metrics.NewCounter(`vm_unmarshal_errors_total{name="graphite"}`)
)
func getPushCtx() *pushCtx {
select {
case ctx := <-pushCtxPoolCh:
return ctx
default:
if v := pushCtxPool.Get(); v != nil {
return v.(*pushCtx)
}
return &pushCtx{}
}
}
func putPushCtx(ctx *pushCtx) {
ctx.reset()
select {
case pushCtxPoolCh <- ctx:
default:
pushCtxPool.Put(ctx)
}
}
var pushCtxPool sync.Pool
var pushCtxPoolCh = make(chan *pushCtx, runtime.GOMAXPROCS(-1))

View File

@@ -1,20 +0,0 @@
package influx
import (
"fmt"
"testing"
)
func BenchmarkRowsUnmarshal(b *testing.B) {
s := `cpu usage_user=1.23,usage_system=4.34,usage_iowait=0.1112 1234556768`
b.SetBytes(int64(len(s)))
b.ReportAllocs()
b.RunParallel(func(pb *testing.PB) {
var rows Rows
for pb.Next() {
if err := rows.Unmarshal(s); err != nil {
panic(fmt.Errorf("cannot unmarshal %q: %s", s, err))
}
}
})
}

View File

@@ -1,222 +1,119 @@
package influx
import (
"bytes"
"compress/gzip"
"fmt"
"flag"
"io"
"net/http"
"runtime"
"sync"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/common"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/concurrencylimiter"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/netstorage"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/auth"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
parser "github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/influx"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/storage"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/tenantmetrics"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/writeconcurrencylimiter"
"github.com/VictoriaMetrics/metrics"
)
var rowsInserted = metrics.NewCounter(`vm_rows_inserted_total{type="influx"}`)
var (
measurementFieldSeparator = flag.String("influxMeasurementFieldSeparator", "_", "Separator for '{measurement}{separator}{field_name}' metric name when inserted via Influx line protocol")
skipSingleField = flag.Bool("influxSkipSingleField", false, "Uses '{measurement}' instead of '{measurement}{separator}{field_name}' for metric name if Influx line contains only a single field")
)
// InsertHandler processes remote write for influx line protocol.
var (
rowsInserted = tenantmetrics.NewCounterMap(`vm_rows_inserted_total{type="influx"}`)
rowsPerInsert = metrics.NewHistogram(`vm_rows_per_insert{type="influx"}`)
)
// InsertHandlerForReader processes remote write for influx line protocol.
//
// See https://github.com/influxdata/influxdb/blob/4cbdc197b8117fee648d62e2e5be75c6575352f0/tsdb/README.md
func InsertHandler(req *http.Request) error {
return concurrencylimiter.Do(func() error {
return insertHandlerInternal(req)
// See https://github.com/influxdata/telegraf/tree/master/plugins/inputs/socket_listener/
func InsertHandlerForReader(at *auth.Token, r io.Reader) error {
return writeconcurrencylimiter.Do(func() error {
return parser.ParseStream(r, false, "", "", func(db string, rows []parser.Row) error {
return insertRows(at, db, rows)
})
})
}
func insertHandlerInternal(req *http.Request) error {
influxReadCalls.Inc()
r := req.Body
if req.Header.Get("Content-Encoding") == "gzip" {
zr, err := getGzipReader(r)
if err != nil {
return fmt.Errorf("cannot read gzipped influx line protocol data: %s", err)
}
defer putGzipReader(zr)
r = zr
}
q := req.URL.Query()
tsMultiplier := int64(1e6)
switch q.Get("precision") {
case "ns":
tsMultiplier = 1e6
case "u":
tsMultiplier = 1e3
case "ms":
tsMultiplier = 1
case "s":
tsMultiplier = -1e3
case "m":
tsMultiplier = -1e3 * 60
case "h":
tsMultiplier = -1e3 * 3600
}
// Read db tag from https://docs.influxdata.com/influxdb/v1.7/tools/api/#write-http-endpoint
db := q.Get("db")
ctx := getPushCtx()
defer putPushCtx(ctx)
for ctx.Read(r, tsMultiplier) {
if err := ctx.InsertRows(db); err != nil {
return err
}
}
return ctx.Error()
// InsertHandlerForHTTP processes remote write for influx line protocol.
//
// See https://github.com/influxdata/influxdb/blob/4cbdc197b8117fee648d62e2e5be75c6575352f0/tsdb/README.md
func InsertHandlerForHTTP(at *auth.Token, req *http.Request) error {
return writeconcurrencylimiter.Do(func() error {
isGzipped := req.Header.Get("Content-Encoding") == "gzip"
q := req.URL.Query()
precision := q.Get("precision")
// Read db tag from https://docs.influxdata.com/influxdb/v1.7/tools/api/#write-http-endpoint
db := q.Get("db")
return parser.ParseStream(req.Body, isGzipped, precision, db, func(db string, rows []parser.Row) error {
return insertRows(at, db, rows)
})
})
}
func (ctx *pushCtx) InsertRows(db string) error {
rows := ctx.Rows.Rows
rowsLen := 0
for i := range rows {
rowsLen += len(rows[i].Tags)
}
func insertRows(at *auth.Token, db string, rows []parser.Row) error {
ctx := getPushCtx()
defer putPushCtx(ctx)
ic := &ctx.Common
ic.Reset(rowsLen)
ic.Reset() // This line is required for initializing ic internals.
rowsTotal := 0
for i := range rows {
r := &rows[i]
ic.Labels = ic.Labels[:0]
ic.AddLabel("db", db)
hasDBLabel := false
for j := range r.Tags {
tag := &r.Tags[j]
if tag.Key == "db" {
hasDBLabel = true
}
ic.AddLabel(tag.Key, tag.Value)
}
ctx.metricNameBuf = storage.MarshalMetricNameRaw(ctx.metricNameBuf[:0], ic.Labels)
if len(db) > 0 && !hasDBLabel {
ic.AddLabel("db", db)
}
ic.MetricNameBuf = storage.MarshalMetricNameRaw(ic.MetricNameBuf[:0], at.AccountID, at.ProjectID, ic.Labels)
metricNameBufLen := len(ic.MetricNameBuf)
ctx.metricGroupBuf = append(ctx.metricGroupBuf[:0], r.Measurement...)
ctx.metricGroupBuf = append(ctx.metricGroupBuf, '.')
skipFieldKey := len(r.Fields) == 1 && *skipSingleField
if len(ctx.metricGroupBuf) > 0 && !skipFieldKey {
ctx.metricGroupBuf = append(ctx.metricGroupBuf, *measurementFieldSeparator...)
}
metricGroupPrefixLen := len(ctx.metricGroupBuf)
ic.AddLabel("", "placeholder")
placeholderLabel := &ic.Labels[len(ic.Labels)-1]
for j := range r.Fields {
f := &r.Fields[j]
ctx.metricGroupBuf = append(ctx.metricGroupBuf[:metricGroupPrefixLen], f.Key...)
if !skipFieldKey {
ctx.metricGroupBuf = append(ctx.metricGroupBuf[:metricGroupPrefixLen], f.Key...)
}
metricGroup := bytesutil.ToUnsafeString(ctx.metricGroupBuf)
ic.Labels = ic.Labels[:0]
ic.Labels = ic.Labels[:len(ic.Labels)-1]
ic.AddLabel("", metricGroup)
ic.WriteDataPoint(ctx.metricNameBuf, ic.Labels[:1], r.Timestamp, f.Value)
ic.MetricNameBuf = storage.MarshalMetricLabelRaw(ic.MetricNameBuf[:metricNameBufLen], placeholderLabel)
storageNodeIdx := ic.GetStorageNodeIdx(at, ic.Labels)
if err := ic.WriteDataPointExt(at, storageNodeIdx, ic.MetricNameBuf, r.Timestamp, f.Value); err != nil {
return err
}
}
rowsInserted.Add(len(r.Fields))
rowsTotal += len(r.Fields)
}
rowsInserted.Get(at).Add(rowsTotal)
rowsPerInsert.Update(float64(rowsTotal))
return ic.FlushBufs()
}
func getGzipReader(r io.Reader) (*gzip.Reader, error) {
v := gzipReaderPool.Get()
if v == nil {
return gzip.NewReader(r)
}
zr := v.(*gzip.Reader)
if err := zr.Reset(r); err != nil {
return nil, err
}
return zr, nil
}
func putGzipReader(zr *gzip.Reader) {
_ = zr.Close()
gzipReaderPool.Put(zr)
}
var gzipReaderPool sync.Pool
const maxReadPacketSize = 4 * 1024 * 1024
func (ctx *pushCtx) Read(r io.Reader, tsMultiplier int64) bool {
if ctx.err != nil {
return false
}
lr := io.LimitReader(r, maxReadPacketSize)
ctx.reqBuf.Reset()
ctx.reqBuf.B = append(ctx.reqBuf.B[:0], ctx.tailBuf...)
n, err := io.CopyBuffer(&ctx.reqBuf, lr, ctx.copyBuf[:])
if err != nil {
influxReadErrors.Inc()
ctx.err = fmt.Errorf("cannot read influx line protocol data: %s", err)
return false
}
if n < maxReadPacketSize {
// Mark the end of stream.
ctx.err = io.EOF
}
// Parse all the rows until the last newline in ctx.reqBuf.B
nn := bytes.LastIndexByte(ctx.reqBuf.B, '\n')
ctx.tailBuf = ctx.tailBuf[:0]
if nn >= 0 {
ctx.tailBuf = append(ctx.tailBuf[:0], ctx.reqBuf.B[nn+1:]...)
ctx.reqBuf.B = ctx.reqBuf.B[:nn]
}
if err = ctx.Rows.Unmarshal(bytesutil.ToUnsafeString(ctx.reqBuf.B)); err != nil {
influxUnmarshalErrors.Inc()
ctx.err = fmt.Errorf("cannot unmarshal influx line protocol data with size %d: %s", len(ctx.reqBuf.B), err)
return false
}
// Adjust timestamps according to tsMultiplier
currentTs := time.Now().UnixNano() / 1e6
if tsMultiplier >= 1 {
for i := range ctx.Rows.Rows {
row := &ctx.Rows.Rows[i]
if row.Timestamp == 0 {
row.Timestamp = currentTs
} else {
row.Timestamp /= tsMultiplier
}
}
} else if tsMultiplier < 0 {
tsMultiplier = -tsMultiplier
for i := range ctx.Rows.Rows {
row := &ctx.Rows.Rows[i]
if row.Timestamp == 0 {
row.Timestamp = currentTs
} else {
row.Timestamp *= tsMultiplier
}
}
}
return true
}
var (
influxReadCalls = metrics.NewCounter(`vm_read_calls_total{name="influx"}`)
influxReadErrors = metrics.NewCounter(`vm_read_errors_total{name="influx"}`)
influxUnmarshalErrors = metrics.NewCounter(`vm_unmarshal_errors_total{name="influx"}`)
)
type pushCtx struct {
Rows Rows
Common common.InsertCtx
reqBuf bytesutil.ByteBuffer
tailBuf []byte
copyBuf [16 * 1024]byte
metricNameBuf []byte
Common netstorage.InsertCtx
metricGroupBuf []byte
err error
}
func (ctx *pushCtx) Error() error {
if ctx.err == io.EOF {
return nil
}
return ctx.err
}
func (ctx *pushCtx) reset() {
ctx.Rows.Reset()
ctx.Common.Reset(0)
ctx.reqBuf.Reset()
ctx.tailBuf = ctx.tailBuf[:0]
ctx.metricNameBuf = ctx.metricNameBuf[:0]
ctx.Common.Reset()
ctx.metricGroupBuf = ctx.metricGroupBuf[:0]
ctx.err = nil
}
func getPushCtx() *pushCtx {

View File

@@ -1,69 +1,177 @@
package vminsert
package main
import (
"flag"
"fmt"
"io"
"net/http"
"strings"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/graphite"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/influx"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/netstorage"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/opentsdb"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/prometheus"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/opentsdbhttp"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/promremotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/vmimport"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/auth"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/envflag"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
graphiteserver "github.com/VictoriaMetrics/VictoriaMetrics/lib/ingestserver/graphite"
influxserver "github.com/VictoriaMetrics/VictoriaMetrics/lib/ingestserver/influx"
opentsdbserver "github.com/VictoriaMetrics/VictoriaMetrics/lib/ingestserver/opentsdb"
opentsdbhttpserver "github.com/VictoriaMetrics/VictoriaMetrics/lib/ingestserver/opentsdbhttp"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/storage"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/writeconcurrencylimiter"
"github.com/VictoriaMetrics/metrics"
)
var (
graphiteListenAddr = flag.String("graphiteListenAddr", "", "TCP and UDP address to listen for Graphite plaintext data. Usually :2003 must be set. Doesn't work if empty")
opentsdbListenAddr = flag.String("opentsdbListenAddr", "", "TCP and UDP address to listen for OpenTSDB put messages. Usually :4242 must be set. Doesn't work if empty")
maxInsertRequestSize = flag.Int("maxInsertRequestSize", 32*1024*1024, "The maximum size of a single insert request in bytes")
graphiteListenAddr = flag.String("graphiteListenAddr", "", "TCP and UDP address to listen for Graphite plaintext data. Usually :2003 must be set. Doesn't work if empty")
influxListenAddr = flag.String("influxListenAddr", "", "TCP and UDP address to listen for Influx line protocol data. Usually :8189 must be set. Doesn't work if empty")
opentsdbListenAddr = flag.String("opentsdbListenAddr", "", "TCP and UDP address to listen for OpenTSDB metrics. "+
"Telnet put messages and HTTP /api/put messages are simultaneously served on TCP port. "+
"Usually :4242 must be set. Doesn't work if empty")
opentsdbHTTPListenAddr = flag.String("opentsdbHTTPListenAddr", "", "TCP address to listen for OpenTSDB HTTP put requests. Usually :4242 must be set. Doesn't work if empty")
httpListenAddr = flag.String("httpListenAddr", ":8480", "Address to listen for http connections")
maxLabelsPerTimeseries = flag.Int("maxLabelsPerTimeseries", 30, "The maximum number of labels accepted per time series. Superfluous labels are dropped")
storageNodes = flagutil.NewArray("storageNode", "Address of vmstorage nodes; usage: -storageNode=vmstorage-host1:8400 -storageNode=vmstorage-host2:8400")
)
// Init initializes vminsert.
func Init() {
var (
influxServer *influxserver.Server
graphiteServer *graphiteserver.Server
opentsdbServer *opentsdbserver.Server
opentsdbhttpServer *opentsdbhttpserver.Server
)
func main() {
envflag.Parse()
buildinfo.Init()
logger.Init()
logger.Infof("initializing netstorage for storageNodes %s...", *storageNodes)
startTime := time.Now()
if len(*storageNodes) == 0 {
logger.Fatalf("missing -storageNode arg")
}
netstorage.InitStorageNodes(*storageNodes)
logger.Infof("successfully initialized netstorage in %.3f seconds", time.Since(startTime).Seconds())
storage.SetMaxLabelsPerTimeseries(*maxLabelsPerTimeseries)
writeconcurrencylimiter.Init()
if len(*influxListenAddr) > 0 {
influxServer = influxserver.MustStart(*influxListenAddr, func(r io.Reader) error {
var at auth.Token // TODO: properly initialize auth token
return influx.InsertHandlerForReader(&at, r)
})
}
if len(*graphiteListenAddr) > 0 {
go graphite.Serve(*graphiteListenAddr)
graphiteServer = graphiteserver.MustStart(*graphiteListenAddr, func(r io.Reader) error {
var at auth.Token // TODO: properly initialize auth token
return graphite.InsertHandler(&at, r)
})
}
if len(*opentsdbListenAddr) > 0 {
go opentsdb.Serve(*opentsdbListenAddr)
opentsdbServer = opentsdbserver.MustStart(*opentsdbListenAddr, func(r io.Reader) error {
var at auth.Token // TODO: properly initialize auth token
return opentsdb.InsertHandler(&at, r)
}, opentsdbhttp.InsertHandler)
}
if len(*opentsdbHTTPListenAddr) > 0 {
opentsdbhttpServer = opentsdbhttpserver.MustStart(*opentsdbHTTPListenAddr, opentsdbhttp.InsertHandler)
}
}
// Stop stops vminsert.
func Stop() {
go func() {
httpserver.Serve(*httpListenAddr, requestHandler)
}()
sig := procutil.WaitForSigterm()
logger.Infof("service received signal %s", sig)
logger.Infof("gracefully shutting down the service at %q", *httpListenAddr)
startTime = time.Now()
if err := httpserver.Stop(*httpListenAddr); err != nil {
logger.Fatalf("cannot stop the service: %s", err)
}
logger.Infof("successfully shut down the service in %.3f seconds", time.Since(startTime).Seconds())
if len(*influxListenAddr) > 0 {
influxServer.MustStop()
}
if len(*graphiteListenAddr) > 0 {
graphite.Stop()
graphiteServer.MustStop()
}
if len(*opentsdbListenAddr) > 0 {
opentsdb.Stop()
opentsdbServer.MustStop()
}
if len(*opentsdbHTTPListenAddr) > 0 {
opentsdbhttpServer.MustStop()
}
logger.Infof("shutting down netstorage...")
startTime = time.Now()
netstorage.Stop()
logger.Infof("successfully stopped netstorage in %.3f seconds", time.Since(startTime).Seconds())
fs.MustStopDirRemover()
logger.Infof("the vminsert has been stopped")
}
// RequestHandler is a handler for Prometheus remote storage write API
func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
path := strings.Replace(r.URL.Path, "//", "/", -1)
switch path {
case "/api/v1/write":
func requestHandler(w http.ResponseWriter, r *http.Request) bool {
p, err := httpserver.ParsePath(r.URL.Path)
if err != nil {
httpserver.Errorf(w, "cannot parse path %q: %s", r.URL.Path, err)
return true
}
if p.Prefix != "insert" {
// This is not our link.
return false
}
at, err := auth.NewToken(p.AuthToken)
if err != nil {
httpserver.Errorf(w, "auth error: %s", err)
return true
}
switch p.Suffix {
case "prometheus/", "prometheus", "prometheus/api/v1/write":
prometheusWriteRequests.Inc()
if err := prometheus.InsertHandler(r, int64(*maxInsertRequestSize)); err != nil {
if err := promremotewrite.InsertHandler(at, r); err != nil {
prometheusWriteErrors.Inc()
httpserver.Errorf(w, "error in %q: %s", r.URL.Path, err)
return true
}
w.WriteHeader(http.StatusNoContent)
return true
case "/write", "/api/v2/write":
case "prometheus/api/v1/import":
vmimportRequests.Inc()
if err := vmimport.InsertHandler(at, r); err != nil {
vmimportErrors.Inc()
httpserver.Errorf(w, "error in %q: %s", r.URL.Path, err)
return true
}
w.WriteHeader(http.StatusNoContent)
return true
case "influx/write", "influx/api/v2/write":
influxWriteRequests.Inc()
if err := influx.InsertHandler(r); err != nil {
if err := influx.InsertHandlerForHTTP(at, r); err != nil {
influxWriteErrors.Inc()
httpserver.Errorf(w, "error in %q: %s", r.URL.Path, err)
return true
}
w.WriteHeader(http.StatusNoContent)
return true
case "/query":
// Emulate fake response for influx query
case "influx/query":
// Emulate fake response for influx query.
// This is required for TSBS benchmark.
influxQueryRequests.Inc()
fmt.Fprintf(w, `{"results":[{"series":[{"values":[]}]}]}`)
return true
@@ -74,11 +182,14 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
}
var (
prometheusWriteRequests = metrics.NewCounter(`vm_http_requests_total{path="/api/v1/write", protocol="prometheus"}`)
prometheusWriteErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/api/v1/write", protocol="prometheus"}`)
prometheusWriteRequests = metrics.NewCounter(`vm_http_requests_total{path="/insert/{}/prometheus/", protocol="prometheus"}`)
prometheusWriteErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/insert/{}/prometheus/", protocol="prometheus"}`)
influxWriteRequests = metrics.NewCounter(`vm_http_requests_total{path="/write", protocol="influx"}`)
influxWriteErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/write", protocol="influx"}`)
vmimportRequests = metrics.NewCounter(`vm_http_requests_total{path="/insert/{}/prometheus/api/v1/import", protocol="vm"}`)
vmimportErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/insert/{}/prometheus/api/v1/import", protocol="vm"}`)
influxQueryRequests = metrics.NewCounter(`vm_http_requests_total{path="/query", protocol="influx"}`)
influxWriteRequests = metrics.NewCounter(`vm_http_requests_total{path="/insert/{}/influx/", protocol="influx"}`)
influxWriteErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/insert/{}/influx/", protocol="influx"}`)
influxQueryRequests = metrics.NewCounter(`vm_http_requests_total{path="/insert/{}/influx/query", protocol="influx"}`)
)
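For reference, requests routed through requestHandler above use cluster-style paths of the form `/insert/<authToken>/<suffix>`. Illustrative examples (the tenant value `42` is a placeholder; the exact auth token format is whatever `auth.NewToken` accepts):
```
POST /insert/42/prometheus/api/v1/write    Prometheus remote write
POST /insert/42/prometheus/api/v1/import   VictoriaMetrics native import
POST /insert/42/influx/write               Influx line protocol over HTTP
GET  /insert/42/influx/query               fake Influx query response for TSBS
```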


@@ -0,0 +1,194 @@
package netstorage
import (
"fmt"
"net/http"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/auth"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/consts"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompb"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/storage"
xxhash "github.com/cespare/xxhash/v2"
jump "github.com/lithammer/go-jump-consistent-hash"
)
// InsertCtx is a generic context for inserting data.
//
// InsertCtx.Reset must be called before the first usage.
type InsertCtx struct {
Labels []prompb.Label
MetricNameBuf []byte
bufRowss []bufRows
labelsBuf []byte
resultCh chan error
}
type bufRows struct {
buf []byte
rows int
}
func (br *bufRows) pushTo(sn *storageNode) error {
bufLen := len(br.buf)
err := sn.push(br.buf, br.rows)
br.buf = br.buf[:0]
br.rows = 0
if err != nil {
return &httpserver.ErrorWithStatusCode{
Err: fmt.Errorf("cannot send %d bytes to storageNode %q: %s", bufLen, sn.dialer.Addr(), err),
StatusCode: http.StatusServiceUnavailable,
}
}
return nil
}
// Reset resets ctx.
func (ctx *InsertCtx) Reset() {
for _, label := range ctx.Labels {
label.Name = nil
label.Value = nil
}
ctx.Labels = ctx.Labels[:0]
ctx.MetricNameBuf = ctx.MetricNameBuf[:0]
if ctx.bufRowss == nil {
ctx.bufRowss = make([]bufRows, len(storageNodes))
}
for i := range ctx.bufRowss {
br := &ctx.bufRowss[i]
br.buf = br.buf[:0]
br.rows = 0
}
ctx.labelsBuf = ctx.labelsBuf[:0]
if ctx.resultCh == nil {
ctx.resultCh = make(chan error, len(storageNodes))
} else if len(ctx.resultCh) > 0 {
logger.Panicf("BUG: ctx.resultCh must be empty on Reset; got %d items", len(ctx.resultCh))
}
}
// AddLabelBytes adds (name, value) label to ctx.Labels.
//
// name and value must exist until ctx.Labels is used.
func (ctx *InsertCtx) AddLabelBytes(name, value []byte) {
labels := ctx.Labels
if cap(labels) > len(labels) {
labels = labels[:len(labels)+1]
} else {
labels = append(labels, prompb.Label{})
}
label := &labels[len(labels)-1]
// Do not copy name and value contents for performance reasons.
// This reduces GC overhead on the number of objects and allocations.
label.Name = name
label.Value = value
ctx.Labels = labels
}
// AddLabel adds (name, value) label to ctx.Labels.
//
// name and value must exist until ctx.Labels is used.
func (ctx *InsertCtx) AddLabel(name, value string) {
labels := ctx.Labels
if cap(labels) > len(labels) {
labels = labels[:len(labels)+1]
} else {
labels = append(labels, prompb.Label{})
}
label := &labels[len(labels)-1]
// Do not copy name and value contents for performance reasons.
// This reduces GC overhead on the number of objects and allocations.
label.Name = bytesutil.ToUnsafeBytes(name)
label.Value = bytesutil.ToUnsafeBytes(value)
ctx.Labels = labels
}
// WriteDataPoint writes (timestamp, value) data point with the given at and labels to ctx buffer.
func (ctx *InsertCtx) WriteDataPoint(at *auth.Token, labels []prompb.Label, timestamp int64, value float64) error {
ctx.MetricNameBuf = storage.MarshalMetricNameRaw(ctx.MetricNameBuf[:0], at.AccountID, at.ProjectID, labels)
storageNodeIdx := ctx.GetStorageNodeIdx(at, labels)
return ctx.WriteDataPointExt(at, storageNodeIdx, ctx.MetricNameBuf, timestamp, value)
}
// WriteDataPointExt writes the given metricNameRaw with (timestamp, value) to ctx buffer with the given storageNodeIdx.
func (ctx *InsertCtx) WriteDataPointExt(at *auth.Token, storageNodeIdx int, metricNameRaw []byte, timestamp int64, value float64) error {
br := &ctx.bufRowss[storageNodeIdx]
sn := storageNodes[storageNodeIdx]
bufNew := storage.MarshalMetricRow(br.buf, metricNameRaw, timestamp, value)
if len(bufNew) >= consts.MaxInsertPacketSize {
// Send buf to storageNode, since it is too big.
if err := br.pushTo(sn); err != nil {
return err
}
br.buf = storage.MarshalMetricRow(bufNew[:0], metricNameRaw, timestamp, value)
} else {
br.buf = bufNew
}
br.rows++
return nil
}
// FlushBufs flushes ctx bufs to remote storage nodes.
func (ctx *InsertCtx) FlushBufs() error {
// Send per-storageNode bufs in parallel.
resultCh := ctx.resultCh
resultChLen := 0
for i := range ctx.bufRowss {
br := &ctx.bufRowss[i]
if len(br.buf) == 0 {
continue
}
resultChLen++
go func(br *bufRows, sn *storageNode) {
resultCh <- br.pushTo(sn)
}(br, storageNodes[i])
}
var lastErr error
for i := 0; i < resultChLen; i++ {
err := <-resultCh
if err != nil {
lastErr = err
}
}
return lastErr
}
// GetStorageNodeIdx returns storage node index for the given at and labels.
//
// The returned index must be passed to WriteDataPoint.
func (ctx *InsertCtx) GetStorageNodeIdx(at *auth.Token, labels []prompb.Label) int {
if len(storageNodes) == 1 {
// Fast path - only a single storage node.
return 0
}
buf := ctx.labelsBuf[:0]
buf = encoding.MarshalUint32(buf, at.AccountID)
buf = encoding.MarshalUint32(buf, at.ProjectID)
for i := range labels {
label := &labels[i]
buf = marshalBytesFast(buf, label.Name)
buf = marshalBytesFast(buf, label.Value)
}
h := xxhash.Sum64(buf)
ctx.labelsBuf = buf
idx := int(jump.Hash(h, int32(len(storageNodes))))
return idx
}
func marshalBytesFast(dst []byte, s []byte) []byte {
dst = encoding.MarshalUint16(dst, uint16(len(s)))
dst = append(dst, s...)
return dst
}


@@ -0,0 +1,36 @@
package netstorage
import (
"runtime"
"sync"
)
// GetInsertCtx returns InsertCtx from the pool.
//
// Call PutInsertCtx for returning it to the pool.
func GetInsertCtx() *InsertCtx {
select {
case ctx := <-insertCtxPoolCh:
return ctx
default:
if v := insertCtxPool.Get(); v != nil {
return v.(*InsertCtx)
}
return &InsertCtx{}
}
}
// PutInsertCtx returns ctx to the pool.
//
// ctx cannot be used after the call.
func PutInsertCtx(ctx *InsertCtx) {
ctx.Reset()
select {
case insertCtxPoolCh <- ctx:
default:
insertCtxPool.Put(ctx)
}
}
var insertCtxPool sync.Pool
var insertCtxPoolCh = make(chan *InsertCtx, runtime.GOMAXPROCS(-1))
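A typical caller takes an InsertCtx from this pool, resets it, fills labels and data points, flushes the buffers and returns the context. A minimal sketch of that flow, assuming an illustrative `row` type (compare with the insertRows handlers later in this diff):
```
package example

import (
	"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/netstorage"
	"github.com/VictoriaMetrics/VictoriaMetrics/lib/auth"
)

// row is an illustrative parsed data point; the real handlers use protocol-specific parsers.
type row struct {
	Metric    string
	Timestamp int64
	Value     float64
}

func insertRows(at *auth.Token, rows []row) error {
	ctx := netstorage.GetInsertCtx()
	defer netstorage.PutInsertCtx(ctx)
	ctx.Reset() // required for initializing ctx internals
	for i := range rows {
		r := &rows[i]
		ctx.Labels = ctx.Labels[:0]
		ctx.AddLabel("", r.Metric) // the empty label name carries the metric name
		if err := ctx.WriteDataPoint(at, ctx.Labels, r.Timestamp, r.Value); err != nil {
			return err
		}
	}
	return ctx.FlushBufs()
}
```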


@@ -0,0 +1,471 @@
package netstorage
import (
"flag"
"fmt"
"sync"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/consts"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/handshake"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/memory"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/netutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/storage"
"github.com/VictoriaMetrics/metrics"
xxhash "github.com/cespare/xxhash/v2"
)
var disableRPCCompression = flag.Bool(`rpc.disableCompression`, false, "Disable compression of RPC traffic. This reduces CPU usage at the cost of higher network bandwidth usage")
// push pushes buf to sn.
//
// It falls back to sending data to another vmstorage node if sn is currently
// unavailable.
//
// rows is the number of rows in the buf.
func (sn *storageNode) push(buf []byte, rows int) error {
if len(buf) > consts.MaxInsertPacketSize {
logger.Panicf("BUG: len(buf)=%d cannot exceed %d", len(buf), consts.MaxInsertPacketSize)
}
sn.rowsPushed.Add(rows)
sn.mu.Lock()
defer sn.mu.Unlock()
if sn.broken {
// The vmstorage node is broken. Re-route buf to healthy vmstorage nodes.
if !addToReroutedBuf(buf, rows) {
rowsLostTotal.Add(rows)
return fmt.Errorf("%d rows dropped because of reroutedBuf overflows %d bytes", rows, reroutedBufMaxSize)
}
sn.rowsReroutedFromHere.Add(rows)
return nil
}
if len(sn.buf)+len(buf) <= consts.MaxInsertPacketSize {
// Fast path: the buf contents fits sn.buf.
sn.buf = append(sn.buf, buf...)
sn.rows += rows
return nil
}
// Slow path: the buf contents doesn't fit sn.buf.
// Flush sn.buf to vmstorage and then add buf to sn.buf.
if err := sn.flushBufLocked(); err != nil {
// Failed to flush or re-route sn.buf to vmstorage nodes.
// The sn.buf is already dropped by flushBufLocked.
// Drop buf too, since there is little sense in trying to rescue it.
rowsLostTotal.Add(rows)
return err
}
// Successful flush.
sn.buf = append(sn.buf, buf...)
sn.rows += rows
return nil
}
func (sn *storageNode) sendReroutedRow(buf []byte) error {
sn.mu.Lock()
defer sn.mu.Unlock()
if sn.broken {
return errBrokenStorageNode
}
if len(sn.buf)+len(buf) > consts.MaxInsertPacketSize {
return fmt.Errorf("cannot put %d bytes into vmstorage buffer, since its size cannot exceed %d bytes", len(sn.buf)+len(buf), consts.MaxInsertPacketSize)
}
sn.buf = append(sn.buf, buf...)
sn.rows++
return nil
}
var errBrokenStorageNode = fmt.Errorf("the vmstorage node is temporarily broken")
func (sn *storageNode) flushBufLocked() error {
err := sn.sendBufLocked(sn.buf)
if err == nil {
// Successful flush. Remove broken flag.
sn.broken = false
sn.rowsSent.Add(sn.rows)
sn.buf = sn.buf[:0]
sn.rows = 0
return nil
}
// Couldn't flush sn.buf to vmstorage. Mark sn as broken
// and try re-routing sn.buf to healthy vmstorage nodes.
sn.broken = true
if !addToReroutedBuf(sn.buf, sn.rows) {
// Preserve sn.buf when it cannot be sent to healthy nodes
// in the hope the error will disappear on the next call to flushBufLocked.
//
// This should fix https://github.com/VictoriaMetrics/VictoriaMetrics/issues/294 .
return err
}
sn.buf = sn.buf[:0]
sn.rows = 0
return err
}
func (sn *storageNode) sendBufLocked(buf []byte) error {
if len(buf) == 0 {
return nil
}
if sn.bc == nil {
if err := sn.dial(); err != nil {
return fmt.Errorf("cannot dial %q: %s", sn.dialer.Addr(), err)
}
}
timeoutSeconds := len(buf) / 3e5
if timeoutSeconds < 60 {
timeoutSeconds = 60
}
timeout := time.Duration(timeoutSeconds) * time.Second
deadline := time.Now().Add(timeout)
if err := sn.bc.SetWriteDeadline(deadline); err != nil {
sn.closeBrokenConn()
return fmt.Errorf("cannot set write deadline to %s: %s", deadline, err)
}
// sizeBuf guarantees that the rows batch will be either fully
// read or fully discarded on the vmstorage side.
// sizeBuf is used for read optimization in vmstorage.
sn.sizeBuf = encoding.MarshalUint64(sn.sizeBuf[:0], uint64(len(buf)))
if _, err := sn.bc.Write(sn.sizeBuf); err != nil {
sn.closeBrokenConn()
return fmt.Errorf("cannot write data size %d: %s", len(buf), err)
}
if _, err := sn.bc.Write(buf); err != nil {
sn.closeBrokenConn()
return fmt.Errorf("cannot write data with size %d: %s", len(buf), err)
}
if err := sn.bc.Flush(); err != nil {
sn.closeBrokenConn()
return fmt.Errorf("cannot flush data with size %d: %s", len(buf), err)
}
return nil
}
func (sn *storageNode) dial() error {
c, err := sn.dialer.Dial()
if err != nil {
sn.dialErrors.Inc()
return err
}
compressionLevel := 1
if *disableRPCCompression {
compressionLevel = 0
}
bc, err := handshake.VMInsertClient(c, compressionLevel)
if err != nil {
_ = c.Close()
sn.handshakeErrors.Inc()
return fmt.Errorf("handshake error: %s", err)
}
sn.bc = bc
return nil
}
func (sn *storageNode) closeBrokenConn() {
if sn.bc == nil {
return
}
_ = sn.bc.Close()
sn.bc = nil
sn.connectionErrors.Inc()
}
func (sn *storageNode) run(stopCh <-chan struct{}) {
ticker := time.NewTicker(time.Second)
defer ticker.Stop()
mustStop := false
for !mustStop {
select {
case <-stopCh:
mustStop = true
// Make sure flushBufLocked is called last time before returning
// in order to send the remaining bits of data.
case <-ticker.C:
}
sn.mu.Lock()
if err := sn.flushBufLocked(); err != nil {
sn.closeBrokenConn()
logger.Errorf("cannot flush data to storageNode %q: %s", sn.dialer.Addr(), err)
}
sn.mu.Unlock()
}
}
func rerouteWorker(stopCh <-chan struct{}) {
ticker := time.NewTicker(time.Second)
defer ticker.Stop()
var buf []byte
mustStop := false
for !mustStop {
select {
case <-stopCh:
mustStop = true
// Make sure spreadReroutedBufToStorageNodes is called last time before returning
// in order to reroute the remaining data to healthy vmstorage nodes.
case <-ticker.C:
}
var err error
buf, err = spreadReroutedBufToStorageNodes(buf[:0])
if err != nil {
rerouteErrors.Inc()
logger.Errorf("cannot reroute data among healthy vmstorage nodes: %s", err)
}
}
}
// storageNode is a client sending data to vmstorage node.
type storageNode struct {
mu sync.Mutex
// Buffer with data that needs to be written to vmstorage node.
buf []byte
// The number of rows buf contains at the moment.
rows int
// Temporary buffer for encoding marshaled buf size.
sizeBuf []byte
// broken is set to true if the given vmstorage node is temporarily unhealthy.
// In this case the data is re-routed to the remaining healthy vmstorage nodes.
broken bool
dialer *netutil.TCPDialer
bc *handshake.BufferedConn
// The number of dial errors to vmstorage node.
dialErrors *metrics.Counter
// The number of handshake errors to vmstorage node.
handshakeErrors *metrics.Counter
// The number of connection errors to vmstorage node.
connectionErrors *metrics.Counter
// The number of rows pushed to storageNode with push method.
rowsPushed *metrics.Counter
// The number of rows sent to vmstorage node.
rowsSent *metrics.Counter
// The number of rows rerouted from the given vmstorage node
// to healthy nodes when the given node was unhealthy.
rowsReroutedFromHere *metrics.Counter
// The number of rows rerouted to the given vmstorage node
// from other nodes when they were unhealthy.
rowsReroutedToHere *metrics.Counter
}
// storageNodes contains a list of vmstorage node clients.
var storageNodes []*storageNode
var (
storageNodesWG sync.WaitGroup
rerouteWorkerWG sync.WaitGroup
)
var (
storageNodesStopCh = make(chan struct{})
rerouteWorkerStopCh = make(chan struct{})
)
// InitStorageNodes initializes vmstorage nodes' connections to the given addrs.
func InitStorageNodes(addrs []string) {
if len(addrs) == 0 {
logger.Panicf("BUG: addrs must be non-empty")
}
if len(addrs) > 255 {
logger.Panicf("BUG: too many addresses: %d; max supported %d addresses", len(addrs), 255)
}
for _, addr := range addrs {
sn := &storageNode{
dialer: netutil.NewTCPDialer("vminsert", addr),
dialErrors: metrics.NewCounter(fmt.Sprintf(`vm_rpc_dial_errors_total{name="vminsert", addr=%q}`, addr)),
handshakeErrors: metrics.NewCounter(fmt.Sprintf(`vm_rpc_handshake_errors_total{name="vminsert", addr=%q}`, addr)),
connectionErrors: metrics.NewCounter(fmt.Sprintf(`vm_rpc_connection_errors_total{name="vminsert", addr=%q}`, addr)),
rowsPushed: metrics.NewCounter(fmt.Sprintf(`vm_rpc_rows_pushed_total{name="vminsert", addr=%q}`, addr)),
rowsSent: metrics.NewCounter(fmt.Sprintf(`vm_rpc_rows_sent_total{name="vminsert", addr=%q}`, addr)),
rowsReroutedFromHere: metrics.NewCounter(fmt.Sprintf(`vm_rpc_rows_rerouted_from_here_total{name="vminsert", addr=%q}`, addr)),
rowsReroutedToHere: metrics.NewCounter(fmt.Sprintf(`vm_rpc_rows_rerouted_to_here_total{name="vminsert", addr=%q}`, addr)),
}
_ = metrics.NewGauge(fmt.Sprintf(`vm_rpc_rows_pending{name="vminsert", addr=%q}`, addr), func() float64 {
sn.mu.Lock()
n := sn.rows
sn.mu.Unlock()
return float64(n)
})
_ = metrics.NewGauge(fmt.Sprintf(`vm_rpc_buf_pending_bytes{name="vminsert", addr=%q}`, addr), func() float64 {
sn.mu.Lock()
n := len(sn.buf)
sn.mu.Unlock()
return float64(n)
})
storageNodes = append(storageNodes, sn)
storageNodesWG.Add(1)
go func(addr string) {
sn.run(storageNodesStopCh)
storageNodesWG.Done()
}(addr)
}
reroutedBufMaxSize = memory.Allowed() / 16
rerouteWorkerWG.Add(1)
go func() {
rerouteWorker(rerouteWorkerStopCh)
rerouteWorkerWG.Done()
}()
}
// Stop gracefully stops netstorage.
func Stop() {
close(rerouteWorkerStopCh)
rerouteWorkerWG.Wait()
close(storageNodesStopCh)
storageNodesWG.Wait()
}
func addToReroutedBuf(buf []byte, rows int) bool {
reroutedLock.Lock()
defer reroutedLock.Unlock()
if len(reroutedBuf)+len(buf) > reroutedBufMaxSize {
reroutedBufOverflows.Inc()
return false
}
reroutedBuf = append(reroutedBuf, buf...)
reroutedRows += rows
reroutesTotal.Inc()
return true
}
func spreadReroutedBufToStorageNodes(swapBuf []byte) ([]byte, error) {
healthyStorageNodes := getHealthyStorageNodes()
if len(healthyStorageNodes) == 0 {
// No more vmstorage nodes to write data to.
return swapBuf, fmt.Errorf("all the storage nodes are unhealthy")
}
reroutedLock.Lock()
reroutedBuf, swapBuf = swapBuf[:0], reroutedBuf
rows := reroutedRows
reroutedRows = 0
reroutedLock.Unlock()
if len(swapBuf) == 0 {
// Nothing to re-route.
return swapBuf, nil
}
var mr storage.MetricRow
src := swapBuf
rowsProcessed := 0
for len(src) > 0 {
tail, err := mr.Unmarshal(src)
if err != nil {
logger.Panicf("BUG: cannot unmarshal recently marshaled MetricRow: %s", err)
}
rowBuf := src[:len(src)-len(tail)]
src = tail
// Use non-consistent hashing instead of jump hash in order to re-route rows
// equally among healthy vmstorage nodes.
// This should spread the increased load among healthy vmstorage nodes.
h := xxhash.Sum64(mr.MetricNameRaw)
idx := h % uint64(len(healthyStorageNodes))
attempts := 0
for {
sn := healthyStorageNodes[idx]
err := sn.sendReroutedRow(rowBuf)
if err == nil {
sn.rowsReroutedToHere.Inc()
break
}
// Cannot send data to sn. Try sending to the next vmstorage node.
idx++
if idx >= uint64(len(healthyStorageNodes)) {
idx = 0
}
attempts++
if attempts < len(healthyStorageNodes) {
continue
}
// There are no healthy nodes.
// Try returning the remaining data to reroutedBuf if it has enough free space.
rowsRemaining := rows - rowsProcessed
recovered := false
reroutedLock.Lock()
if len(rowBuf)+len(tail)+len(reroutedBuf) <= reroutedBufMaxSize {
swapBuf = append(swapBuf[:0], rowBuf...)
swapBuf = append(swapBuf, tail...)
swapBuf = append(swapBuf, reroutedBuf...)
reroutedBuf, swapBuf = swapBuf, reroutedBuf[:0]
reroutedRows += rowsRemaining
recovered = true
}
reroutedLock.Unlock()
if recovered {
return swapBuf, nil
}
rowsLostTotal.Add(rowsRemaining)
return swapBuf, fmt.Errorf("all the %d vmstorage nodes are unavailable; lost %d rows; last error: %s", len(storageNodes), rowsRemaining, err)
}
rowsProcessed++
}
if rowsProcessed != rows {
logger.Panicf("BUG: unexpected number of rows processed; got %d; want %d", rowsProcessed, rows)
}
reroutedRowsProcessed.Add(rowsProcessed)
return swapBuf, nil
}
var (
reroutedLock sync.Mutex
reroutedBuf []byte
reroutedRows int
reroutedBufMaxSize int
reroutedRowsProcessed = metrics.NewCounter(`vm_rpc_rerouted_rows_processed_total{name="vminsert"}`)
reroutedBufOverflows = metrics.NewCounter(`vm_rpc_rerouted_buf_overflows_total{name="vminsert"}`)
reroutesTotal = metrics.NewCounter(`vm_rpc_reroutes_total{name="vminsert"}`)
_ = metrics.NewGauge(`vm_rpc_rerouted_rows_pending{name="vminsert"}`, func() float64 {
reroutedLock.Lock()
n := reroutedRows
reroutedLock.Unlock()
return float64(n)
})
_ = metrics.NewGauge(`vm_rpc_rerouted_buf_pending_bytes{name="vminsert"}`, func() float64 {
reroutedLock.Lock()
n := len(reroutedBuf)
reroutedLock.Unlock()
return float64(n)
})
rerouteErrors = metrics.NewCounter(`vm_rpc_reroute_errors_total{name="vminsert"}`)
rowsLostTotal = metrics.NewCounter(`vm_rpc_rows_lost_total{name="vminsert"}`)
)
func getHealthyStorageNodes() []*storageNode {
sns := make([]*storageNode, 0, len(storageNodes)-1)
for _, sn := range storageNodes {
sn.mu.Lock()
if !sn.broken {
sns = append(sns, sn)
}
sn.mu.Unlock()
}
return sns
}
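The file above uses two routing schemes: jump consistent hashing over the marshaled labels on the normal path, which keeps series-to-node assignments stable, and a plain modulo over the healthy nodes when rerouting, which deliberately spreads the extra load evenly. A minimal sketch of both, with illustrative names:
```
package example

import (
	xxhash "github.com/cespare/xxhash/v2"
	jump "github.com/lithammer/go-jump-consistent-hash"
)

// pickNodes returns the node index used on the normal path (jump hash, stable
// when the node count changes) and on the reroute path (plain modulo over the
// currently healthy nodes). key would be the marshaled labels or MetricNameRaw.
// healthyNodes is assumed to be > 0; the real code returns an error otherwise.
func pickNodes(key []byte, nodes, healthyNodes int) (normalIdx, rerouteIdx int) {
	h := xxhash.Sum64(key)
	normalIdx = int(jump.Hash(h, int32(nodes)))
	rerouteIdx = int(h % uint64(healthyNodes))
	return normalIdx, rerouteIdx
}
```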


@@ -1,24 +0,0 @@
package opentsdb
import (
"fmt"
"testing"
)
func BenchmarkRowsUnmarshal(b *testing.B) {
s := `cpu.usage_user 1234556768 1.23 a=b
cpu.usage_system 1234556768 23.344 a=b
cpu.usage_iowait 1234556769 3.3443 a=b
cpu.usage_irq 1234556768 0.34432 a=b
`
b.SetBytes(int64(len(s)))
b.ReportAllocs()
b.RunParallel(func(pb *testing.PB) {
var rows Rows
for pb.Next() {
if err := rows.Unmarshal(s); err != nil {
panic(fmt.Errorf("cannot unmarshal %q: %s", s, err))
}
}
})
}


@@ -1,166 +1,63 @@
package opentsdb
import (
"bytes"
"fmt"
"io"
"net"
"runtime"
"sync"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/common"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/concurrencylimiter"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/netstorage"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/auth"
parser "github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/opentsdb"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/tenantmetrics"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/writeconcurrencylimiter"
"github.com/VictoriaMetrics/metrics"
"github.com/valyala/fastjson/fastfloat"
)
var rowsInserted = metrics.NewCounter(`vm_rows_inserted_total{type="opentsdb"}`)
var (
rowsInserted = tenantmetrics.NewCounterMap(`vm_rows_inserted_total{type="opentsdb"}`)
rowsPerInsert = metrics.NewHistogram(`vm_rows_per_insert{type="opentsdb"}`)
)
// insertHandler processes remote write for OpenTSDB put protocol.
// InsertHandler processes remote write for OpenTSDB put protocol.
//
// See http://opentsdb.net/docs/build/html/api_telnet/put.html
func insertHandler(r io.Reader) error {
return concurrencylimiter.Do(func() error {
return insertHandlerInternal(r)
func InsertHandler(at *auth.Token, r io.Reader) error {
return writeconcurrencylimiter.Do(func() error {
return parser.ParseStream(r, func(rows []parser.Row) error {
return insertRows(at, rows)
})
})
}
func insertHandlerInternal(r io.Reader) error {
ctx := getPushCtx()
defer putPushCtx(ctx)
for ctx.Read(r) {
if err := ctx.InsertRows(); err != nil {
func insertRows(at *auth.Token, rows []parser.Row) error {
ctx := netstorage.GetInsertCtx()
defer netstorage.PutInsertCtx(ctx)
ctx.Reset() // This line is required for initializing ctx internals.
atCopy := *at
for i := range rows {
r := &rows[i]
ctx.Labels = ctx.Labels[:0]
ctx.AddLabel("", r.Metric)
for j := range r.Tags {
tag := &r.Tags[j]
if atCopy.AccountID == 0 {
// Multi-tenancy support via custom tags.
// Do not allow overriding AccountID and ProjectID from atCopy for security reasons.
if tag.Key == "VictoriaMetrics_AccountID" {
atCopy.AccountID = uint32(fastfloat.ParseUint64BestEffort(tag.Value))
}
if atCopy.ProjectID == 0 && tag.Key == "VictoriaMetrics_ProjectID" {
atCopy.ProjectID = uint32(fastfloat.ParseUint64BestEffort(tag.Value))
}
}
ctx.AddLabel(tag.Key, tag.Value)
}
if err := ctx.WriteDataPoint(&atCopy, ctx.Labels, r.Timestamp, r.Value); err != nil {
return err
}
}
return ctx.Error()
// Assume that all the rows for a single connection belong to the same (AccountID, ProjectID).
rowsInserted.Get(&atCopy).Add(len(rows))
rowsPerInsert.Update(float64(len(rows)))
return ctx.FlushBufs()
}
func (ctx *pushCtx) InsertRows() error {
rows := ctx.Rows.Rows
ic := &ctx.Common
ic.Reset(len(rows))
for i := range rows {
r := &rows[i]
ic.Labels = ic.Labels[:0]
ic.AddLabel("", r.Metric)
for j := range r.Tags {
tag := &r.Tags[j]
ic.AddLabel(tag.Key, tag.Value)
}
ic.WriteDataPoint(nil, ic.Labels, r.Timestamp, r.Value)
}
rowsInserted.Add(len(rows))
return ic.FlushBufs()
}
const maxReadPacketSize = 4 * 1024 * 1024
const flushTimeout = 3 * time.Second
func (ctx *pushCtx) Read(r io.Reader) bool {
opentsdbReadCalls.Inc()
if ctx.err != nil {
return false
}
if c, ok := r.(net.Conn); ok {
if err := c.SetReadDeadline(time.Now().Add(flushTimeout)); err != nil {
opentsdbReadErrors.Inc()
ctx.err = fmt.Errorf("cannot set read deadline: %s", err)
return false
}
}
lr := io.LimitReader(r, maxReadPacketSize)
ctx.reqBuf.Reset()
ctx.reqBuf.B = append(ctx.reqBuf.B[:0], ctx.tailBuf...)
n, err := io.CopyBuffer(&ctx.reqBuf, lr, ctx.copyBuf[:])
if err != nil {
if ne, ok := err.(net.Error); ok && ne.Timeout() {
// Flush the read data on timeout and try reading again.
} else {
opentsdbReadErrors.Inc()
ctx.err = fmt.Errorf("cannot read OpenTSDB put protocol data: %s", err)
return false
}
} else if n < maxReadPacketSize {
// Mark the end of stream.
ctx.err = io.EOF
}
// Parse all the rows until the last newline in ctx.reqBuf.B
nn := bytes.LastIndexByte(ctx.reqBuf.B, '\n')
ctx.tailBuf = ctx.tailBuf[:0]
if nn >= 0 {
ctx.tailBuf = append(ctx.tailBuf[:0], ctx.reqBuf.B[nn+1:]...)
ctx.reqBuf.B = ctx.reqBuf.B[:nn]
}
if err = ctx.Rows.Unmarshal(bytesutil.ToUnsafeString(ctx.reqBuf.B)); err != nil {
opentsdbUnmarshalErrors.Inc()
ctx.err = fmt.Errorf("cannot unmarshal OpenTSDB put protocol data with size %d: %s", len(ctx.reqBuf.B), err)
return false
}
// Convert timestamps from seconds to milliseconds
for i := range ctx.Rows.Rows {
ctx.Rows.Rows[i].Timestamp *= 1e3
}
return true
}
type pushCtx struct {
Rows Rows
Common common.InsertCtx
reqBuf bytesutil.ByteBuffer
tailBuf []byte
copyBuf [16 * 1024]byte
err error
}
func (ctx *pushCtx) Error() error {
if ctx.err == io.EOF {
return nil
}
return ctx.err
}
func (ctx *pushCtx) reset() {
ctx.Rows.Reset()
ctx.Common.Reset(0)
ctx.reqBuf.Reset()
ctx.tailBuf = ctx.tailBuf[:0]
ctx.err = nil
}
var (
opentsdbReadCalls = metrics.NewCounter(`vm_read_calls_total{name="opentsdb"}`)
opentsdbReadErrors = metrics.NewCounter(`vm_read_errors_total{name="opentsdb"}`)
opentsdbUnmarshalErrors = metrics.NewCounter(`vm_unmarshal_errors_total{name="opentsdb"}`)
)
func getPushCtx() *pushCtx {
select {
case ctx := <-pushCtxPoolCh:
return ctx
default:
if v := pushCtxPool.Get(); v != nil {
return v.(*pushCtx)
}
return &pushCtx{}
}
}
func putPushCtx(ctx *pushCtx) {
ctx.reset()
select {
case pushCtxPoolCh <- ctx:
default:
pushCtxPool.Put(ctx)
}
}
var pushCtxPool sync.Pool
var pushCtxPoolCh = make(chan *pushCtx, runtime.GOMAXPROCS(-1))
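When the connection-level AccountID is 0, insertRows above lets the sender pick the tenant via the `VictoriaMetrics_AccountID` and `VictoriaMetrics_ProjectID` tags. A hedged sketch of a client sending one telnet-style put line with such tags (the address and values are illustrative):
```
package main

import (
	"fmt"
	"net"
)

func main() {
	// Assumes an OpenTSDB listener started with -opentsdbListenAddr=:4242.
	c, err := net.Dial("tcp", "localhost:4242")
	if err != nil {
		panic(err)
	}
	defer c.Close()
	// The tenancy tags are picked up by insertRows when the connection's AccountID is 0.
	fmt.Fprintf(c, "put cpu.usage_user %d %g host=web1 VictoriaMetrics_AccountID=42 VictoriaMetrics_ProjectID=7\n",
		int64(1234556768), 1.23)
}
```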


@@ -1,137 +0,0 @@
package opentsdb
import (
"net"
"runtime"
"strings"
"sync"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/metrics"
)
var (
writeRequestsTCP = metrics.NewCounter(`vm_opentsdb_requests_total{name="write", net="tcp"}`)
writeErrorsTCP = metrics.NewCounter(`vm_opentsdb_request_errors_total{name="write", net="tcp"}`)
writeRequestsUDP = metrics.NewCounter(`vm_opentsdb_requests_total{name="write", net="udp"}`)
writeErrorsUDP = metrics.NewCounter(`vm_opentsdb_request_errors_total{name="write", net="udp"}`)
)
// Serve starts OpenTSDB collector on the given addr.
func Serve(addr string) {
logger.Infof("starting TCP OpenTSDB collector at %q", addr)
lnTCP, err := net.Listen("tcp4", addr)
if err != nil {
logger.Fatalf("cannot start TCP OpenTSDB collector at %q: %s", addr, err)
}
listenerTCP = lnTCP
logger.Infof("starting UDP OpenTSDB collector at %q", addr)
lnUDP, err := net.ListenPacket("udp4", addr)
if err != nil {
logger.Fatalf("cannot start UDP OpenTSDB collector at %q: %s", addr, err)
}
listenerUDP = lnUDP
var wg sync.WaitGroup
wg.Add(1)
go func() {
defer wg.Done()
serveTCP(listenerTCP)
logger.Infof("stopped TCP OpenTSDB collector at %q", addr)
}()
wg.Add(1)
go func() {
defer wg.Done()
serveUDP(listenerUDP)
logger.Infof("stopped UDP OpenTSDB collector at %q", addr)
}()
wg.Wait()
}
func serveTCP(ln net.Listener) {
for {
c, err := ln.Accept()
if err != nil {
if ne, ok := err.(net.Error); ok {
if ne.Temporary() {
time.Sleep(time.Second)
continue
}
if strings.Contains(err.Error(), "use of closed network connection") {
break
}
logger.Fatalf("unrecoverable error when accepting TCP OpenTSDB connections: %s", err)
}
logger.Fatalf("unexpected error when accepting TCP OpenTSDB connections: %s", err)
}
go func() {
writeRequestsTCP.Inc()
if err := insertHandler(c); err != nil {
writeErrorsTCP.Inc()
logger.Errorf("error in TCP OpenTSDB conn %q<->%q: %s", c.LocalAddr(), c.RemoteAddr(), err)
}
_ = c.Close()
}()
}
}
func serveUDP(ln net.PacketConn) {
gomaxprocs := runtime.GOMAXPROCS(-1)
var wg sync.WaitGroup
for i := 0; i < gomaxprocs; i++ {
wg.Add(1)
go func() {
defer wg.Done()
var bb bytesutil.ByteBuffer
bb.B = bytesutil.Resize(bb.B, 64*1024)
for {
bb.Reset()
bb.B = bb.B[:cap(bb.B)]
n, addr, err := ln.ReadFrom(bb.B)
if err != nil {
writeErrorsUDP.Inc()
if ne, ok := err.(net.Error); ok {
if ne.Temporary() {
time.Sleep(time.Second)
continue
}
if strings.Contains(err.Error(), "use of closed network connection") {
break
}
}
logger.Errorf("cannot read OpenTSDB UDP data: %s", err)
continue
}
bb.B = bb.B[:n]
writeRequestsUDP.Inc()
if err := insertHandler(bb.NewReader()); err != nil {
writeErrorsUDP.Inc()
logger.Errorf("error in UDP OpenTSDB conn %q<->%q: %s", ln.LocalAddr(), addr, err)
continue
}
}
}()
}
wg.Wait()
}
var (
listenerTCP net.Listener
listenerUDP net.PacketConn
)
// Stop stops the server.
func Stop() {
logger.Infof("stopping TCP OpenTSDB server at %q...", listenerTCP.Addr())
if err := listenerTCP.Close(); err != nil {
logger.Errorf("cannot close TCP OpenTSDB server: %s", err)
}
logger.Infof("stopping UDP OpenTSDB server at %q...", listenerUDP.LocalAddr())
if err := listenerUDP.Close(); err != nil {
logger.Errorf("cannot close UDP OpenTSDB server: %s", err)
}
}


@@ -0,0 +1,69 @@
package opentsdbhttp
import (
"fmt"
"net/http"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/netstorage"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/auth"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
parser "github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/opentsdbhttp"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/tenantmetrics"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/writeconcurrencylimiter"
"github.com/VictoriaMetrics/metrics"
)
var (
rowsInserted = tenantmetrics.NewCounterMap(`vm_rows_inserted_total{type="opentsdb-http"}`)
rowsPerInsert = metrics.NewHistogram(`vm_rows_per_insert{type="opentsdbhttp"}`)
)
// InsertHandler processes HTTP OpenTSDB put requests.
// See http://opentsdb.net/docs/build/html/api_http/put.html
func InsertHandler(req *http.Request) error {
path := req.URL.Path
p, err := httpserver.ParsePath(path)
if err != nil {
return fmt.Errorf("cannot parse path %q: %s", path, err)
}
if p.Prefix != "insert" {
// This is not our link.
return fmt.Errorf("unexpected path requested on HTTP OpenTSDB server: %q", path)
}
at, err := auth.NewToken(p.AuthToken)
if err != nil {
return fmt.Errorf("auth error: %s", err)
}
switch p.Suffix {
case "api/put", "opentsdb/api/put":
return writeconcurrencylimiter.Do(func() error {
return parser.ParseStream(req, func(rows []parser.Row) error {
return insertRows(at, rows)
})
})
default:
return fmt.Errorf("unexpected path requested on HTTP OpenTSDB server: %q", path)
}
}
func insertRows(at *auth.Token, rows []parser.Row) error {
ctx := netstorage.GetInsertCtx()
defer netstorage.PutInsertCtx(ctx)
ctx.Reset() // This line is required for initializing ctx internals.
for i := range rows {
r := &rows[i]
ctx.Labels = ctx.Labels[:0]
ctx.AddLabel("", r.Metric)
for j := range r.Tags {
tag := &r.Tags[j]
ctx.AddLabel(tag.Key, tag.Value)
}
if err := ctx.WriteDataPoint(at, ctx.Labels, r.Timestamp, r.Value); err != nil {
return err
}
}
rowsInserted.Get(at).Add(len(rows))
rowsPerInsert.Update(float64(len(rows)))
return ctx.FlushBufs()
}


@@ -1,106 +0,0 @@
package prometheus
import (
"fmt"
"net/http"
"runtime"
"sync"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/common"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/concurrencylimiter"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompb"
"github.com/VictoriaMetrics/metrics"
)
var rowsInserted = metrics.NewCounter(`vm_rows_inserted_total{type="prometheus"}`)
// InsertHandler processes remote write for prometheus.
func InsertHandler(r *http.Request, maxSize int64) error {
return concurrencylimiter.Do(func() error {
return insertHandlerInternal(r, maxSize)
})
}
func insertHandlerInternal(r *http.Request, maxSize int64) error {
ctx := getPushCtx()
defer putPushCtx(ctx)
if err := ctx.Read(r, maxSize); err != nil {
return err
}
timeseries := ctx.req.Timeseries
rowsLen := 0
for i := range timeseries {
rowsLen += len(timeseries[i].Samples)
}
ic := &ctx.Common
ic.Reset(rowsLen)
for i := range timeseries {
ts := &timeseries[i]
var metricNameRaw []byte
for i := range ts.Samples {
r := &ts.Samples[i]
metricNameRaw = ic.WriteDataPointExt(metricNameRaw, ts.Labels, r.Timestamp, r.Value)
}
rowsInserted.Add(len(ts.Samples))
}
return ic.FlushBufs()
}
type pushCtx struct {
Common common.InsertCtx
req prompb.WriteRequest
reqBuf []byte
}
func (ctx *pushCtx) reset() {
ctx.Common.Reset(0)
ctx.req.Reset()
ctx.reqBuf = ctx.reqBuf[:0]
}
func (ctx *pushCtx) Read(r *http.Request, maxSize int64) error {
prometheusReadCalls.Inc()
var err error
ctx.reqBuf, err = prompb.ReadSnappy(ctx.reqBuf[:0], r.Body, maxSize)
if err != nil {
prometheusReadErrors.Inc()
return fmt.Errorf("cannot read prompb.WriteRequest: %s", err)
}
if err = ctx.req.Unmarshal(ctx.reqBuf); err != nil {
prometheusUnmarshalErrors.Inc()
return fmt.Errorf("cannot unmarshal prompb.WriteRequest with size %d bytes: %s", len(ctx.reqBuf), err)
}
return nil
}
var (
prometheusReadCalls = metrics.NewCounter(`vm_read_calls_total{name="prometheus"}`)
prometheusReadErrors = metrics.NewCounter(`vm_read_errors_total{name="prometheus"}`)
prometheusUnmarshalErrors = metrics.NewCounter(`vm_unmarshal_errors_total{name="prometheus"}`)
)
func getPushCtx() *pushCtx {
select {
case ctx := <-pushCtxPoolCh:
return ctx
default:
if v := pushCtxPool.Get(); v != nil {
return v.(*pushCtx)
}
return &pushCtx{}
}
}
func putPushCtx(ctx *pushCtx) {
ctx.reset()
select {
case pushCtxPoolCh <- ctx:
default:
pushCtxPool.Put(ctx)
}
}
var pushCtxPool sync.Pool
var pushCtxPoolCh = make(chan *pushCtx, runtime.GOMAXPROCS(-1))


@@ -0,0 +1,54 @@
package promremotewrite
import (
"net/http"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/netstorage"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/auth"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompb"
parser "github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/promremotewrite"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/storage"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/tenantmetrics"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/writeconcurrencylimiter"
"github.com/VictoriaMetrics/metrics"
)
var (
rowsInserted = tenantmetrics.NewCounterMap(`vm_rows_inserted_total{type="promremotewrite"}`)
rowsPerInsert = metrics.NewHistogram(`vm_rows_per_insert{type="promremotewrite"}`)
)
// InsertHandler processes remote write for prometheus.
func InsertHandler(at *auth.Token, req *http.Request) error {
return writeconcurrencylimiter.Do(func() error {
return parser.ParseStream(req, func(timeseries []prompb.TimeSeries) error {
return insertRows(at, timeseries)
})
})
}
func insertRows(at *auth.Token, timeseries []prompb.TimeSeries) error {
ctx := netstorage.GetInsertCtx()
defer netstorage.PutInsertCtx(ctx)
ctx.Reset() // This line is required for initializing ctx internals.
rowsTotal := 0
for i := range timeseries {
ts := &timeseries[i]
storageNodeIdx := ctx.GetStorageNodeIdx(at, ts.Labels)
ctx.MetricNameBuf = ctx.MetricNameBuf[:0]
for i := range ts.Samples {
r := &ts.Samples[i]
if len(ctx.MetricNameBuf) == 0 {
ctx.MetricNameBuf = storage.MarshalMetricNameRaw(ctx.MetricNameBuf[:0], at.AccountID, at.ProjectID, ts.Labels)
}
if err := ctx.WriteDataPointExt(at, storageNodeIdx, ctx.MetricNameBuf, r.Timestamp, r.Value); err != nil {
return err
}
}
rowsTotal += len(ts.Samples)
}
rowsInserted.Get(at).Add(rowsTotal)
rowsPerInsert.Update(float64(rowsTotal))
return ctx.FlushBufs()
}


@@ -0,0 +1,60 @@
package vmimport
import (
"net/http"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert/netstorage"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/auth"
parser "github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/vmimport"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/storage"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/tenantmetrics"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/writeconcurrencylimiter"
"github.com/VictoriaMetrics/metrics"
)
var (
rowsInserted = tenantmetrics.NewCounterMap(`vm_rows_inserted_total{type="vmimport"}`)
rowsPerInsert = metrics.NewHistogram(`vm_rows_per_insert{type="vmimport"}`)
)
// InsertHandler processes `/api/v1/import` request.
//
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6
func InsertHandler(at *auth.Token, req *http.Request) error {
return writeconcurrencylimiter.Do(func() error {
return parser.ParseStream(req, func(rows []parser.Row) error {
return insertRows(at, rows)
})
})
}
func insertRows(at *auth.Token, rows []parser.Row) error {
ctx := netstorage.GetInsertCtx()
defer netstorage.PutInsertCtx(ctx)
ctx.Reset() // This line is required for initializing ctx internals.
rowsTotal := 0
for i := range rows {
r := &rows[i]
ctx.Labels = ctx.Labels[:0]
for j := range r.Tags {
tag := &r.Tags[j]
ctx.AddLabelBytes(tag.Key, tag.Value)
}
ctx.MetricNameBuf = storage.MarshalMetricNameRaw(ctx.MetricNameBuf[:0], at.AccountID, at.ProjectID, ctx.Labels)
storageNodeIdx := ctx.GetStorageNodeIdx(at, ctx.Labels)
values := r.Values
timestamps := r.Timestamps
_ = timestamps[len(values)-1]
for j, value := range values {
timestamp := timestamps[j]
if err := ctx.WriteDataPointExt(at, storageNodeIdx, ctx.MetricNameBuf, timestamp, value); err != nil {
return err
}
}
rowsTotal += len(values)
}
rowsInserted.Get(at).Add(rowsTotal)
rowsPerInsert.Update(float64(rowsTotal))
return ctx.FlushBufs()
}
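The `_ = timestamps[len(values)-1]` line above is a common Go bounds-check-elimination hint: indexing the last element once up front lets the compiler drop the per-iteration bounds check on `timestamps[j]` inside the loop, and it also panics early if the two slices have mismatched lengths. A minimal sketch of the same pattern, with illustrative names:
```
package example

// sumPairs multiplies matching elements of a and b. The one-time index into b
// mirrors the hint used in insertRows above.
func sumPairs(a, b []float64) float64 {
	_ = b[len(a)-1] // bounds-check-elimination hint; panics early if b is too short
	s := 0.0
	for i, v := range a {
		s += v * b[i]
	}
	return s
}
```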

app/vmrestore/Makefile (new file, 67 lines)

@@ -0,0 +1,67 @@
# All these commands must run from repository root.
vmrestore:
APP_NAME=vmrestore $(MAKE) app-local
vmrestore-prod:
APP_NAME=vmrestore $(MAKE) app-via-docker
vmrestore-pure-prod:
APP_NAME=vmrestore $(MAKE) app-via-docker-pure
vmrestore-amd64-prod:
APP_NAME=vmrestore $(MAKE) app-via-docker-amd64
vmrestore-arm-prod:
APP_NAME=vmrestore $(MAKE) app-via-docker-arm
vmrestore-arm64-prod:
APP_NAME=vmrestore $(MAKE) app-via-docker-arm64
vmrestore-ppc64le-prod:
APP_NAME=vmrestore $(MAKE) app-via-docker-ppc64le
vmrestore-386-prod:
APP_NAME=vmrestore $(MAKE) app-via-docker-386
package-vmrestore:
APP_NAME=vmrestore $(MAKE) package-via-docker
package-vmrestore-pure:
APP_NAME=vmrestore $(MAKE) package-via-docker-pure
package-vmrestore-amd64:
APP_NAME=vmrestore $(MAKE) package-via-docker-amd64
package-vmrestore-arm:
APP_NAME=vmrestore $(MAKE) package-via-docker-arm
package-vmrestore-arm64:
APP_NAME=vmrestore $(MAKE) package-via-docker-arm64
package-vmrestore-ppc64le:
APP_NAME=vmrestore $(MAKE) package-via-docker-ppc64le
package-vmrestore-386:
APP_NAME=vmrestore $(MAKE) package-via-docker-386
publish-vmrestore:
APP_NAME=vmrestore $(MAKE) publish-via-docker
vmrestore-pure:
APP_NAME=vmrestore $(MAKE) app-local-pure
vmrestore-amd64:
CGO_ENABLED=1 GOOS=linux GOARCH=amd64 GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/vmrestore-amd64 ./app/vmrestore
vmrestore-arm:
CGO_ENABLED=0 GOOS=linux GOARCH=arm GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/vmrestore-arm ./app/vmrestore
vmrestore-arm64:
CGO_ENABLED=0 GOOS=linux GOARCH=arm64 GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/vmrestore-arm64 ./app/vmrestore
vmrestore-ppc64le:
CGO_ENABLED=0 GOOS=linux GOARCH=ppc64le GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/vmrestore-ppc64le ./app/vmrestore
vmrestore-386:
CGO_ENABLED=0 GOOS=linux GOARCH=386 GO111MODULE=on go build -mod=vendor -ldflags "$(GO_BUILDINFO)" -o bin/vmrestore-386 ./app/vmrestore

app/vmrestore/README.md (new file, 86 lines)

@@ -0,0 +1,86 @@
## vmrestore
`vmrestore` restores data from backups created by [vmbackup](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmbackup/README.md).
VictoriaMetrics `v1.29.0` and newer versions must be used for working with the restored data.
The restore process can be interrupted at any time. It is automatically resumed from the interruption point
when `vmrestore` is restarted with the same args.
### Usage
VictoriaMetrics must be stopped during the restore process.
```
vmrestore -src=gcs://<bucket>/<path/to/backup> -storageDataPath=<local/path/to/restore>
```
* `<bucket>` is [GCS bucket](https://cloud.google.com/storage/docs/creating-buckets) name.
* `<path/to/backup>` is the path to the backup made with [vmbackup](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmbackup/README.md) in the GCS bucket.
* `<local/path/to/restore>` is the path to the folder where the data will be restored. This folder must be passed
to VictoriaMetrics via the `-storageDataPath` command-line flag after the restore process is complete.
The original `-storageDataPath` directory may contain old files. They will be substituted by the files from the backup.
### Troubleshooting
* If `vmrestore` eats all the network bandwidth, then set `-maxBytesPerSecond` to the desired value.
* If `vmrestore` has been interrupted due to a temporary error, then just restart it with the same args. It will resume the restore process.
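For example, restarting with a bandwidth cap could look like this (the 10 MB/s value is purely illustrative):
```
vmrestore -src=gcs://<bucket>/<path/to/backup> -storageDataPath=<local/path/to/restore> -maxBytesPerSecond=10000000
```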
### Advanced usage
Run `vmrestore -help` in order to see all the available options:
```
-concurrency int
The number of concurrent workers. Higher concurrency may reduce restore duration (default 10)
-configFilePath string
Path to file with S3 configs. Configs are loaded from default location if not set.
See https://docs.aws.amazon.com/general/latest/gr/aws-security-credentials.html
-configProfile string
Profile name for S3 configs (default "default")
-credsFilePath string
Path to file with GCS or S3 credentials. Credentials are loaded from default locations if not set.
See https://cloud.google.com/iam/docs/creating-managing-service-account-keys and https://docs.aws.amazon.com/general/latest/gr/aws-security-credentials.html
-customS3Endpoint string
Custom S3 endpoint for use with S3-compatible storages (e.g. MinIO). S3 is used if not set
-loggerLevel string
Minimum level of errors to log. Possible values: INFO, ERROR, FATAL, PANIC (default "INFO")
-maxBytesPerSecond int
The maximum download speed. There is no limit if it is set to 0
-memory.allowedPercent float
Allowed percent of system memory VictoriaMetrics caches may occupy (default 60)
-src string
Source path with backup on the remote storage. Example: gcs://bucket/path/to/backup/dir, s3://bucket/path/to/backup/dir or fs:///path/to/local/backup/dir
-storageDataPath string
Destination path where backup must be restored. VictoriaMetrics must be stopped when restoring from backup. -storageDataPath dir can be non-empty. In this case only missing data is downloaded from backup (default "victoria-metrics-data")
-version
Show VictoriaMetrics version
```
### How to build from sources
It is recommended to use [binary releases](https://github.com/VictoriaMetrics/VictoriaMetrics/releases) - see `vmutils-*` archives there.
#### Development build
1. [Install Go](https://golang.org/doc/install). The minimum supported version is Go 1.12.
2. Run `make vmrestore` from the root folder of the repository.
It builds the `vmrestore` binary and puts it into the `bin` folder.
#### Production build
1. [Install docker](https://docs.docker.com/install/).
2. Run `make vmrestore-prod` from the root folder of the repository.
It builds the `vmrestore-prod` binary and puts it into the `bin` folder.
#### Building docker images
Run `make package-vmrestore`. It builds the `victoriametrics/vmrestore:<PKG_TAG>` docker image locally.
`<PKG_TAG>` is an auto-generated image tag, which depends on the source code in the repository.
The `<PKG_TAG>` may be manually set via `PKG_TAG=foobar make package-vmrestore`.


@@ -0,0 +1,7 @@
ARG certs_image
FROM $certs_image AS certs
FROM scratch
COPY --from=certs /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-certificates.crt
ARG src_binary
COPY $src_binary ./vmrestore-prod
ENTRYPOINT ["/vmrestore-prod"]

app/vmrestore/main.go (new file, 81 lines)

@@ -0,0 +1,81 @@
package main
import (
"flag"
"fmt"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/backup/actions"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/backup/common"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/backup/fslocal"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/envflag"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
)
var (
src = flag.String("src", "", "Source path with backup on the remote storage. "+
"Example: gcs://bucket/path/to/backup/dir, s3://bucket/path/to/backup/dir or fs:///path/to/local/backup/dir")
storageDataPath = flag.String("storageDataPath", "victoria-metrics-data", "Destination path where backup must be restored. "+
"VictoriaMetrics must be stopped when restoring from backup. -storageDataPath dir can be non-empty. In this case only missing data is downloaded from backup")
concurrency = flag.Int("concurrency", 10, "The number of concurrent workers. Higher concurrency may reduce restore duration")
maxBytesPerSecond = flag.Int("maxBytesPerSecond", 0, "The maximum download speed. There is no limit if it is set to 0")
skipBackupCompleteCheck = flag.Bool("skipBackupCompleteCheck", false, "Whether to skip checking for 'backup complete' file in -src. This may be useful for restoring from old backups, which were created without 'backup complete' file")
)
func main() {
flag.Usage = usage
envflag.Parse()
buildinfo.Init()
srcFS, err := newSrcFS()
if err != nil {
logger.Fatalf("%s", err)
}
dstFS, err := newDstFS()
if err != nil {
logger.Fatalf("%s", err)
}
a := &actions.Restore{
Concurrency: *concurrency,
Src: srcFS,
Dst: dstFS,
SkipBackupCompleteCheck: *skipBackupCompleteCheck,
}
if err := a.Run(); err != nil {
logger.Fatalf("cannot restore from backup: %s", err)
}
}
func usage() {
const s = `
vmrestore restores VictoriaMetrics data from backups made by vmbackup.
See the docs at https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/app/vmrestore/README.md .
`
f := flag.CommandLine.Output()
fmt.Fprintf(f, "%s\n", s)
flag.PrintDefaults()
}
func newDstFS() (*fslocal.FS, error) {
if len(*storageDataPath) == 0 {
return nil, fmt.Errorf("`-storageDataPath` cannot be empty")
}
fs := &fslocal.FS{
Dir: *storageDataPath,
MaxBytesPerSecond: *maxBytesPerSecond,
}
if err := fs.Init(); err != nil {
return nil, fmt.Errorf("cannot initialize local fs: %s", err)
}
return fs, nil
}
func newSrcFS() (common.RemoteFS, error) {
fs, err := actions.NewRemoteFS(*src)
if err != nil {
return nil, fmt.Errorf("cannot parse `-src`=%q: %s", *src, err)
}
return fs, nil
}

app/vmselect/Makefile (new file, 38 lines)

@@ -0,0 +1,38 @@
# All these commands must run from repository root.
run-vmselect:
mkdir -p vmselect-cache
DOCKER_OPTS='-v $(shell pwd)/vmselect-cache:/cache' \
APP_NAME=vmselect \
ARGS='-storageNode=localhost:8401 -selectNode=localhost:8481 -cacheDataPath=/cache' \
$(MAKE) run-via-docker
vmselect:
APP_NAME=vmselect $(MAKE) app-local
vmselect-race:
APP_NAME=vmselect RACE=-race $(MAKE) app-local
vmselect-prod:
APP_NAME=vmselect $(MAKE) app-via-docker
vmselect-pure-prod:
APP_NAME=vmselect $(MAKE) app-via-docker-pure
vmselect-prod-race:
APP_NAME=vmselect RACE=-race $(MAKE) app-via-docker
vmselect-pure:
APP_NAME=vmselect $(MAKE) app-local-pure
package-vmselect:
APP_NAME=vmselect $(MAKE) package-via-docker
package-vmselect-race:
APP_NAME=vmselect RACE=-race $(MAKE) package-via-docker
publish-vmselect:
APP_NAME=vmselect $(MAKE) publish-via-docker
publish-vmselect-race:
APP_NAME=vmselect RACE=-race $(MAKE) publish-via-docker


@@ -1,2 +1,6 @@
`vmselect` performs the incoming queries and fetches the required data
from `vmstorage`.
`vmselect` performs the following tasks:
- Splits incoming selects into tasks for `vmstorage` nodes and issues these tasks
to all the `vmstorage` nodes in the cluster.
- Merges responses from all the `vmstorage` nodes and returns a single response.


@@ -0,0 +1,8 @@
ARG certs_image
FROM $certs_image AS certs
FROM scratch
COPY --from=certs /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-certificates.crt
ARG src_binary
COPY $src_binary ./vmselect-prod
EXPOSE 8481
ENTRYPOINT ["/vmselect-prod"]


@@ -1,7 +1,8 @@
package vmselect
package main
import (
"flag"
"fmt"
"net/http"
"runtime"
"strings"
@@ -10,57 +11,179 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/netstorage"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/prometheus"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/promql"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmstorage"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/auth"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/envflag"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/storage"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/timerpool"
"github.com/VictoriaMetrics/metrics"
)
var (
deleteAuthKey = flag.String("deleteAuthKey", "", "authKey for metrics' deletion via /api/v1/admin/tsdb/delete_series")
maxConcurrentRequests = flag.Int("search.maxConcurrentRequests", runtime.GOMAXPROCS(-1)*2, "The maximum number of concurrent search requests. It shouldn't exceed 2*vCPUs for better performance. See also -search.maxQueueDuration")
maxQueueDuration = flag.Duration("search.maxQueueDuration", 10*time.Second, "The maximum time the request waits for execution when -search.maxConcurrentRequests limit is reached")
httpListenAddr = flag.String("httpListenAddr", ":8481", "Address to listen for http connections")
cacheDataPath = flag.String("cacheDataPath", "", "Path to directory for cache files. Cache isn't saved if empty")
maxConcurrentRequests = flag.Int("search.maxConcurrentRequests", getDefaultMaxConcurrentRequests(), "The maximum number of concurrent search requests. "+
"It shouldn't be high, since a single request can saturate all the CPU cores. See also -search.maxQueueDuration")
maxQueueDuration = flag.Duration("search.maxQueueDuration", 10*time.Second, "The maximum time the request waits for execution when -search.maxConcurrentRequests limit is reached")
minScrapeInterval = flag.Duration("dedup.minScrapeInterval", 0, "Remove superfluous samples from time series if they are located closer to each other than this duration. "+
"This may be useful for reducing overhead when multiple identically configured Prometheus instances write data to the same VictoriaMetrics. "+
"Deduplication is disabled if the -dedup.minScrapeInterval is 0")
resetCacheAuthKey = flag.String("search.resetCacheAuthKey", "", "Optional authKey for resetting rollup cache via /internal/resetCache call")
storageNodes = flagutil.NewArray("storageNode", "Addresses of vmstorage nodes; usage: -storageNode=vmstorage-host1:8401 -storageNode=vmstorage-host2:8401")
)
// Init initializes vmselect
func Init() {
tmpDirPath := *vmstorage.DataPath + "/tmp"
fs.RemoveDirContents(tmpDirPath)
netstorage.InitTmpBlocksDir(tmpDirPath)
promql.InitRollupResultCache(*vmstorage.DataPath + "/cache/rollupResult")
func getDefaultMaxConcurrentRequests() int {
n := runtime.GOMAXPROCS(-1)
if n <= 4 {
n *= 2
}
if n > 16 {
// A single request can saturate all the CPU cores, so there is no sense
// in allowing a higher number of concurrent requests - they would just contend
// for unavailable CPU time.
n = 16
}
return n
}
func main() {
envflag.Parse()
buildinfo.Init()
logger.Init()
logger.Infof("starting netstorage at storageNodes %s", *storageNodes)
startTime := time.Now()
storage.SetMinScrapeIntervalForDeduplication(*minScrapeInterval)
if len(*storageNodes) == 0 {
logger.Fatalf("missing -storageNode arg")
}
netstorage.InitStorageNodes(*storageNodes)
logger.Infof("started netstorage in %.3f seconds", time.Since(startTime).Seconds())
if len(*cacheDataPath) > 0 {
tmpDataPath := *cacheDataPath + "/tmp"
fs.RemoveDirContents(tmpDataPath)
netstorage.InitTmpBlocksDir(tmpDataPath)
promql.InitRollupResultCache(*cacheDataPath + "/rollupResult")
} else {
netstorage.InitTmpBlocksDir("")
promql.InitRollupResultCache("")
}
concurrencyCh = make(chan struct{}, *maxConcurrentRequests)
go func() {
httpserver.Serve(*httpListenAddr, requestHandler)
}()
sig := procutil.WaitForSigterm()
logger.Infof("service received signal %s", sig)
logger.Infof("gracefully shutting down the service at %q", *httpListenAddr)
startTime = time.Now()
if err := httpserver.Stop(*httpListenAddr); err != nil {
logger.Fatalf("cannot stop the service: %s", err)
}
logger.Infof("successfully shut down the service in %.3f seconds", time.Since(startTime).Seconds())
logger.Infof("shutting down netstorage...")
startTime = time.Now()
netstorage.Stop()
if len(*cacheDataPath) > 0 {
promql.StopRollupResultCache()
}
logger.Infof("successfully stopped netstorage in %.3f seconds", time.Since(startTime).Seconds())
fs.MustStopDirRemover()
logger.Infof("the vmselect has been stopped")
}
var concurrencyCh chan struct{}
// Stop stops vmselect
func Stop() {
promql.StopRollupResultCache()
}
var (
concurrencyLimitReached = metrics.NewCounter(`vm_concurrent_select_limit_reached_total`)
concurrencyLimitTimeout = metrics.NewCounter(`vm_concurrent_select_limit_timeout_total`)
// RequestHandler handles remote read API requests for Prometheus
func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
_ = metrics.NewGauge(`vm_concurrent_select_capacity`, func() float64 {
return float64(cap(concurrencyCh))
})
_ = metrics.NewGauge(`vm_concurrent_select_current`, func() float64 {
return float64(len(concurrencyCh))
})
)
func requestHandler(w http.ResponseWriter, r *http.Request) bool {
startTime := time.Now()
// Limit the number of concurrent queries.
// Sleep for a second until giving up. This should resolve short bursts in requests.
t := time.NewTimer(*maxQueueDuration)
select {
case concurrencyCh <- struct{}{}:
t.Stop()
defer func() { <-concurrencyCh }()
case <-t.C:
httpserver.Errorf(w, "cannot handle more than %d concurrent requests", cap(concurrencyCh))
return true
default:
// Sleep for a while until giving up. This should resolve short bursts in requests.
concurrencyLimitReached.Inc()
t := timerpool.Get(*maxQueueDuration)
select {
case concurrencyCh <- struct{}{}:
timerpool.Put(t)
defer func() { <-concurrencyCh }()
case <-t.C:
timerpool.Put(t)
concurrencyLimitTimeout.Inc()
err := &httpserver.ErrorWithStatusCode{
Err: fmt.Errorf("cannot handle more than %d concurrent search requests during %s; possible solutions: "+
"increase `-search.maxQueueDuration`, increase `-search.maxConcurrentRequests`, increase server capacity",
*maxConcurrentRequests, *maxQueueDuration),
StatusCode: http.StatusServiceUnavailable,
}
httpserver.Errorf(w, "%s", err)
return true
}
}
path := strings.Replace(r.URL.Path, "//", "/", -1)
if strings.HasPrefix(path, "/api/v1/label/") {
s := r.URL.Path[len("/api/v1/label/"):]
if path == "/internal/resetRollupResultCache" {
if len(*resetCacheAuthKey) > 0 && r.FormValue("authKey") != *resetCacheAuthKey {
sendPrometheusError(w, r, fmt.Errorf("invalid authKey=%q for %q", r.FormValue("authKey"), path))
return true
}
promql.ResetRollupResultCache()
return true
}
p, err := httpserver.ParsePath(path)
if err != nil {
httpserver.Errorf(w, "cannot parse path %q: %s", path, err)
return true
}
at, err := auth.NewToken(p.AuthToken)
if err != nil {
httpserver.Errorf(w, "auth error: %s", err)
return true
}
switch p.Prefix {
case "select":
return selectHandler(startTime, w, r, p, at)
case "delete":
return deleteHandler(startTime, w, r, p, at)
default:
// This is not our link
return false
}
}
func selectHandler(startTime time.Time, w http.ResponseWriter, r *http.Request, p *httpserver.Path, at *auth.Token) bool {
if strings.HasPrefix(p.Suffix, "prometheus/api/v1/label/") {
s := p.Suffix[len("prometheus/api/v1/label/"):]
if strings.HasSuffix(s, "/values") {
labelValuesRequests.Inc()
labelName := s[:len(s)-len("/values")]
httpserver.EnableCORS(w, r)
if err := prometheus.LabelValuesHandler(labelName, w, r); err != nil {
if err := prometheus.LabelValuesHandler(startTime, at, labelName, w, r); err != nil {
labelValuesErrors.Inc()
sendPrometheusError(w, r, err)
return true
@@ -69,76 +192,105 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
}
}
switch path {
case "/api/v1/query":
switch p.Suffix {
case "prometheus/api/v1/query":
queryRequests.Inc()
httpserver.EnableCORS(w, r)
if err := prometheus.QueryHandler(w, r); err != nil {
if err := prometheus.QueryHandler(startTime, at, w, r); err != nil {
queryErrors.Inc()
sendPrometheusError(w, r, err)
return true
}
return true
case "/api/v1/query_range":
case "prometheus/api/v1/query_range":
queryRangeRequests.Inc()
httpserver.EnableCORS(w, r)
if err := prometheus.QueryRangeHandler(w, r); err != nil {
if err := prometheus.QueryRangeHandler(startTime, at, w, r); err != nil {
queryRangeErrors.Inc()
sendPrometheusError(w, r, err)
return true
}
return true
case "/api/v1/series":
case "prometheus/api/v1/series":
seriesRequests.Inc()
httpserver.EnableCORS(w, r)
if err := prometheus.SeriesHandler(w, r); err != nil {
if err := prometheus.SeriesHandler(startTime, at, w, r); err != nil {
seriesErrors.Inc()
sendPrometheusError(w, r, err)
return true
}
return true
case "/api/v1/series/count":
case "prometheus/api/v1/series/count":
seriesCountRequests.Inc()
httpserver.EnableCORS(w, r)
if err := prometheus.SeriesCountHandler(w, r); err != nil {
if err := prometheus.SeriesCountHandler(startTime, at, w, r); err != nil {
seriesCountErrors.Inc()
sendPrometheusError(w, r, err)
return true
}
return true
case "/api/v1/labels":
case "prometheus/api/v1/labels":
labelsRequests.Inc()
httpserver.EnableCORS(w, r)
if err := prometheus.LabelsHandler(w, r); err != nil {
if err := prometheus.LabelsHandler(startTime, at, w, r); err != nil {
labelsErrors.Inc()
sendPrometheusError(w, r, err)
return true
}
return true
case "/api/v1/export":
case "prometheus/api/v1/labels/count":
labelsCountRequests.Inc()
httpserver.EnableCORS(w, r)
if err := prometheus.LabelsCountHandler(startTime, at, w, r); err != nil {
labelsCountErrors.Inc()
sendPrometheusError(w, r, err)
return true
}
return true
case "prometheus/api/v1/export":
exportRequests.Inc()
if err := prometheus.ExportHandler(w, r); err != nil {
if err := prometheus.ExportHandler(startTime, at, w, r); err != nil {
exportErrors.Inc()
httpserver.Errorf(w, "error in %q: %s", r.URL.Path, err)
return true
}
return true
case "/federate":
case "prometheus/federate":
federateRequests.Inc()
if err := prometheus.FederateHandler(w, r); err != nil {
if err := prometheus.FederateHandler(startTime, at, w, r); err != nil {
federateErrors.Inc()
httpserver.Errorf(w, "error int %q: %s", r.URL.Path, err)
httpserver.Errorf(w, "error in %q: %s", r.URL.Path, err)
return true
}
return true
case "/api/v1/admin/tsdb/delete_series":
case "prometheus/api/v1/rules":
// Return dumb placeholder
rulesRequests.Inc()
w.Header().Set("Content-Type", "application/json")
fmt.Fprintf(w, "%s", `{"status":"success","data":{"groups":[]}}`)
return true
case "prometheus/api/v1/alerts":
// Return dumb placeholder
alertsRequests.Inc()
w.Header().Set("Content-Type", "application/json")
fmt.Fprintf(w, "%s", `{"status":"success","data":{"alerts":[]}}`)
return true
case "prometheus/api/v1/metadata":
// Return dumb placeholder
metadataRequests.Inc()
w.Header().Set("Content-Type", "application/json")
fmt.Fprintf(w, "%s", `{"status":"success","data":{}}`)
return true
default:
return false
}
}
func deleteHandler(startTime time.Time, w http.ResponseWriter, r *http.Request, p *httpserver.Path, at *auth.Token) bool {
switch p.Suffix {
case "prometheus/api/v1/admin/tsdb/delete_series":
deleteRequests.Inc()
authKey := r.FormValue("authKey")
if authKey != *deleteAuthKey {
httpserver.Errorf(w, "invalid authKey %q. It must match the value from -deleteAuthKey command line flag", authKey)
return true
}
if err := prometheus.DeleteHandler(r); err != nil {
if err := prometheus.DeleteHandler(startTime, at, r); err != nil {
deleteErrors.Inc()
httpserver.Errorf(w, "error in %q: %s", r.URL.Path, err)
return true
@@ -151,39 +303,49 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
}
func sendPrometheusError(w http.ResponseWriter, r *http.Request, err error) {
logger.Errorf("error in %q: %s", r.URL.Path, err)
logger.Errorf("error in %q: %s", r.RequestURI, err)
w.Header().Set("Content-Type", "application/json")
statusCode := 422
statusCode := http.StatusUnprocessableEntity
if esc, ok := err.(*httpserver.ErrorWithStatusCode); ok {
statusCode = esc.StatusCode
}
w.WriteHeader(statusCode)
prometheus.WriteErrorResponse(w, statusCode, err)
}
var (
labelValuesRequests = metrics.NewCounter(`vm_http_requests_total{path="/api/v1/label/{}/values"}`)
labelValuesErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/api/v1/label/{}/values"}`)
labelValuesRequests = metrics.NewCounter(`vm_http_requests_total{path="/select/{}/prometheus/api/v1/label/{}/values"}`)
labelValuesErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/select/{}/prometheus/api/v1/label/{}/values"}`)
queryRequests = metrics.NewCounter(`vm_http_requests_total{path="/api/v1/query"}`)
queryErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/api/v1/query"}`)
queryRequests = metrics.NewCounter(`vm_http_requests_total{path="/select/{}/prometheus/api/v1/query"}`)
queryErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/select/{}/prometheus/api/v1/query"}`)
queryRangeRequests = metrics.NewCounter(`vm_http_requests_total{path="/api/v1/query_range"}`)
queryRangeErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/api/v1/query_range"}`)
queryRangeRequests = metrics.NewCounter(`vm_http_requests_total{path="/select/{}/prometheus/api/v1/query_range"}`)
queryRangeErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/select/{}/prometheus/api/v1/query_range"}`)
seriesRequests = metrics.NewCounter(`vm_http_requests_total{path="/api/v1/series"}`)
seriesErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/api/v1/series"}`)
seriesRequests = metrics.NewCounter(`vm_http_requests_total{path="/select/{}/prometheus/api/v1/series"}`)
seriesErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/select/{}/prometheus/api/v1/series"}`)
seriesCountRequests = metrics.NewCounter(`vm_http_requests_total{path="/api/v1/series/count"}`)
seriesCountErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/api/v1/series/count"}`)
seriesCountRequests = metrics.NewCounter(`vm_http_requests_total{path="/select/{}/prometheus/api/v1/series/count"}`)
seriesCountErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/select/{}/prometheus/api/v1/series/count"}`)
labelsRequests = metrics.NewCounter(`vm_http_requests_total{path="/api/v1/labels"}`)
labelsErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/api/v1/labels"}`)
labelsRequests = metrics.NewCounter(`vm_http_requests_total{path="/select/{}/prometheus/api/v1/labels"}`)
labelsErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/select/{}/prometheus/api/v1/labels"}`)
deleteRequests = metrics.NewCounter(`vm_http_requests_total{path="/api/v1/admin/tsdb/delete_series"}`)
deleteErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/api/v1/admin/tsdb/delete_series"}`)
labelsCountRequests = metrics.NewCounter(`vm_http_requests_total{path="/select/{}/prometheus/api/v1/labels/count"}`)
labelsCountErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/select/{}/prometheus/api/v1/labels/count"}`)
exportRequests = metrics.NewCounter(`vm_http_requests_total{path="/api/v1/export"}`)
exportErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/api/v1/export"}`)
deleteRequests = metrics.NewCounter(`vm_http_requests_total{path="/delete/{}/prometheus/api/v1/admin/tsdb/delete_series"}`)
deleteErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/delete/{}/prometheus/api/v1/admin/tsdb/delete_series"}`)
federateRequests = metrics.NewCounter(`vm_http_requests_total{path="/federate"}`)
federateErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/federate"}`)
exportRequests = metrics.NewCounter(`vm_http_requests_total{path="/select/{}/prometheus/api/v1/export"}`)
exportErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/select/{}/prometheus/api/v1/export"}`)
federateRequests = metrics.NewCounter(`vm_http_requests_total{path="/select/{}/prometheus/federate"}`)
federateErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/select/{}/prometheus/federate"}`)
rulesRequests = metrics.NewCounter(`vm_http_requests_total{path="/select/{}/prometheus/api/v1/rules"}`)
alertsRequests = metrics.NewCounter(`vm_http_requests_total{path="/select/{}/prometheus/api/v1/alerts"}`)
metadataRequests = metrics.NewCounter(`vm_http_requests_total{path="/select/{}/prometheus/api/v1/metadata"}`)
)
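
The concurrency limiting in requestHandler above follows a common pattern: a buffered channel acts as a semaphore, and a timer bounds how long a request may queue for a free slot. A minimal standalone sketch, with the flag values hardcoded for illustration:
package main

import (
	"errors"
	"fmt"
	"time"
)

var (
	concurrencyCh    = make(chan struct{}, 4) // stands in for -search.maxConcurrentRequests
	maxQueueDuration = 10 * time.Second       // stands in for -search.maxQueueDuration
)

func withConcurrencyLimit(f func()) error {
	select {
	case concurrencyCh <- struct{}{}:
		// Fast path: a slot is available immediately.
	default:
		// Slow path: wait for a slot for up to maxQueueDuration.
		t := time.NewTimer(maxQueueDuration)
		select {
		case concurrencyCh <- struct{}{}:
			t.Stop()
		case <-t.C:
			return errors.New("cannot handle more concurrent requests; try increasing the queue duration or the concurrency limit")
		}
	}
	defer func() { <-concurrencyCh }()
	f()
	return nil
}

func main() {
	err := withConcurrencyLimit(func() { fmt.Println("handling request") })
	fmt.Println(err)
}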

View File

@@ -1,9 +0,0 @@
package netstorage
import (
"os"
)
func mustFadviseRandomRead(f *os.File) {
// Do nothing :)
}

View File

@@ -1,15 +0,0 @@
package netstorage
import (
"os"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"golang.org/x/sys/unix"
)
func mustFadviseRandomRead(f *os.File) {
fd := int(f.Fd())
if err := unix.Fadvise(int(fd), 0, 0, unix.FADV_RANDOM|unix.FADV_WILLNEED); err != nil {
logger.Panicf("FATAL: error returned from unix.Fadvise(RANDOM|WILLNEED): %s", err)
}
}

File diff suppressed because it is too large

View File

@@ -1,7 +1,6 @@
package netstorage
import (
"bufio"
"fmt"
"io/ioutil"
"os"
@@ -10,6 +9,7 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/memory"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/storage"
"github.com/VictoriaMetrics/metrics"
)
@@ -22,9 +22,7 @@ func InitTmpBlocksDir(tmpDirPath string) {
tmpDirPath = os.TempDir()
}
tmpBlocksDir = tmpDirPath + "/searchResults"
if err := os.RemoveAll(tmpBlocksDir); err != nil {
logger.Panicf("FATAL: cannot remove %q: %s", tmpBlocksDir, err)
}
fs.MustRemoveAll(tmpBlocksDir)
if err := fs.MkdirAllIfNotExist(tmpBlocksDir); err != nil {
logger.Panicf("FATAL: cannot create %q: %s", tmpBlocksDir, err)
}
@@ -32,13 +30,27 @@ func InitTmpBlocksDir(tmpDirPath string) {
var tmpBlocksDir string
const maxInmemoryTmpBlocksFile = 512 * 1024
func maxInmemoryTmpBlocksFile() int {
mem := memory.Allowed()
maxLen := mem / 1024
if maxLen < 64*1024 {
return 64 * 1024
}
if maxLen > 4*1024*1024 {
return 4 * 1024 * 1024
}
return maxLen
}
var _ = metrics.NewGauge(`vm_tmp_blocks_max_inmemory_file_size_bytes`, func() float64 {
return float64(maxInmemoryTmpBlocksFile())
})
type tmpBlocksFile struct {
buf []byte
f *os.File
bw *bufio.Writer
f *os.File
r *fs.ReaderAt
offset uint64
}
@@ -46,7 +58,9 @@ type tmpBlocksFile struct {
func getTmpBlocksFile() *tmpBlocksFile {
v := tmpBlocksFilePool.Get()
if v == nil {
return &tmpBlocksFile{}
return &tmpBlocksFile{
buf: make([]byte, 0, maxInmemoryTmpBlocksFile()),
}
}
return v.(*tmpBlocksFile)
}
@@ -55,7 +69,7 @@ func putTmpBlocksFile(tbf *tmpBlocksFile) {
tbf.MustClose()
tbf.buf = tbf.buf[:0]
tbf.f = nil
tbf.bw = nil
tbf.r = nil
tbf.offset = 0
tmpBlocksFilePool.Put(tbf)
}
@@ -71,51 +85,34 @@ func (addr tmpBlockAddr) String() string {
return fmt.Sprintf("offset %d, size %d", addr.offset, addr.size)
}
func getBufioWriter(f *os.File) *bufio.Writer {
v := bufioWriterPool.Get()
if v == nil {
return bufio.NewWriterSize(f, maxInmemoryTmpBlocksFile*2)
}
bw := v.(*bufio.Writer)
bw.Reset(f)
return bw
}
func putBufioWriter(bw *bufio.Writer) {
bufioWriterPool.Put(bw)
}
var bufioWriterPool sync.Pool
var tmpBlocksFilesCreated = metrics.NewCounter(`vm_tmp_blocks_files_created_total`)
// WriteBlock writes b to tbf.
// WriteBlockData writes b to tbf.
//
// It returns errors since the operation may fail on space shortage
// and this must be handled.
func (tbf *tmpBlocksFile) WriteBlock(b *storage.Block) (tmpBlockAddr, error) {
func (tbf *tmpBlocksFile) WriteBlockData(b []byte) (tmpBlockAddr, error) {
var addr tmpBlockAddr
addr.offset = tbf.offset
tbfBufLen := len(tbf.buf)
tbf.buf = storage.MarshalBlock(tbf.buf, b)
addr.size = len(tbf.buf) - tbfBufLen
addr.size = len(b)
tbf.offset += uint64(addr.size)
if tbf.offset <= maxInmemoryTmpBlocksFile {
if len(tbf.buf)+len(b) <= cap(tbf.buf) {
// Fast path - the data fits tbf.buf
tbf.buf = append(tbf.buf, b...)
return addr, nil
}
// Slow path: flush the data from tbf.buf to file.
if tbf.f == nil {
f, err := ioutil.TempFile(tmpBlocksDir, "")
if err != nil {
return addr, err
}
tbf.f = f
tbf.bw = getBufioWriter(f)
tmpBlocksFilesCreated.Inc()
}
_, err := tbf.bw.Write(tbf.buf)
tbf.buf = tbf.buf[:0]
_, err := tbf.f.Write(tbf.buf)
tbf.buf = append(tbf.buf[:0], b...)
if err != nil {
return addr, fmt.Errorf("cannot write block to %q: %s", tbf.f.Name(), err)
}
@@ -126,15 +123,21 @@ func (tbf *tmpBlocksFile) Finalize() error {
if tbf.f == nil {
return nil
}
err := tbf.bw.Flush()
putBufioWriter(tbf.bw)
tbf.bw = nil
if _, err := tbf.f.Seek(0, 0); err != nil {
logger.Panicf("FATAL: cannot seek to the start of file: %s", err)
fname := tbf.f.Name()
if _, err := tbf.f.Write(tbf.buf); err != nil {
return fmt.Errorf("cannot write the remaining %d bytes to %q: %s", len(tbf.buf), fname, err)
}
mustFadviseRandomRead(tbf.f)
return err
tbf.buf = tbf.buf[:0]
r, err := fs.OpenReaderAt(fname)
if err != nil {
logger.Panicf("FATAL: cannot open %q: %s", fname, err)
}
// Hint the OS that the file is read almost sequentially.
// This should reduce the number of disk seeks, which is important
// for HDDs.
r.MustFadviseSequentialRead(true)
tbf.r = r
return nil
}
func (tbf *tmpBlocksFile) MustReadBlockAt(dst *storage.Block, addr tmpBlockAddr) {
@@ -145,13 +148,7 @@ func (tbf *tmpBlocksFile) MustReadBlockAt(dst *storage.Block, addr tmpBlockAddr)
bb := tmpBufPool.Get()
defer tmpBufPool.Put(bb)
bb.B = bytesutil.Resize(bb.B, addr.size)
n, err := tbf.f.ReadAt(bb.B, int64(addr.offset))
if err != nil {
logger.Panicf("FATAL: cannot read from %q at %s: %s", tbf.f.Name(), addr, err)
}
if n != len(bb.B) {
logger.Panicf("FATAL: too short number of bytes read at %s; got %d; want %d", addr, n, len(bb.B))
}
tbf.r.MustReadAt(bb.B, int64(addr.offset))
buf = bb.B
}
tail, err := storage.UnmarshalBlock(dst, buf)
@@ -169,9 +166,9 @@ func (tbf *tmpBlocksFile) MustClose() {
if tbf.f == nil {
return
}
if tbf.bw != nil {
putBufioWriter(tbf.bw)
tbf.bw = nil
if tbf.r != nil {
// tbf.r could be nil if Finalize wasn't called.
tbf.r.MustClose()
}
fname := tbf.f.Name()
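
A simplified sketch of the buffering strategy used by tmpBlocksFile above: data is accumulated in an in-memory buffer sized up front, and the buffer is spilled to a temporary file only when it overflows. The spillBuffer type and its names are invented for the example; cleanup of the temporary file is omitted.
package main

import (
	"fmt"
	"os"
)

type spillBuffer struct {
	buf    []byte
	f      *os.File
	offset uint64
}

func newSpillBuffer(maxInmemorySize int) *spillBuffer {
	return &spillBuffer{buf: make([]byte, 0, maxInmemorySize)}
}

// Write appends b and returns its logical offset; the in-memory buffer is
// flushed to a temporary file once it can no longer hold the data.
func (sb *spillBuffer) Write(b []byte) (uint64, error) {
	offset := sb.offset
	sb.offset += uint64(len(b))
	if len(sb.buf)+len(b) <= cap(sb.buf) {
		// Fast path: the data fits into memory.
		sb.buf = append(sb.buf, b...)
		return offset, nil
	}
	// Slow path: flush the buffered data to a file.
	if sb.f == nil {
		f, err := os.CreateTemp("", "spill")
		if err != nil {
			return offset, err
		}
		sb.f = f
	}
	if _, err := sb.f.Write(sb.buf); err != nil {
		return offset, err
	}
	sb.buf = append(sb.buf[:0], b...)
	return offset, nil
}

func main() {
	sb := newSpillBuffer(8)
	for i := 0; i < 4; i++ {
		off, err := sb.Write([]byte("abcd"))
		fmt.Println(off, err)
	}
}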

View File

@@ -30,7 +30,7 @@ func TestTmpBlocksFileSerial(t *testing.T) {
}
func TestTmpBlocksFileConcurrent(t *testing.T) {
concurrency := 4
concurrency := 3
ch := make(chan error, concurrency)
for i := 0; i < concurrency; i++ {
go func() {
@@ -69,7 +69,7 @@ func testTmpBlocksFile() error {
_, _, _ = b.MarshalData(0, 0)
return &b
}
for _, size := range []int{1024, 16 * 1024, maxInmemoryTmpBlocksFile / 2, 2 * maxInmemoryTmpBlocksFile} {
for _, size := range []int{1024, 16 * 1024, maxInmemoryTmpBlocksFile() / 2, 2 * maxInmemoryTmpBlocksFile()} {
err := func() error {
tbf := getTmpBlocksFile()
defer putTmpBlocksFile(tbf)
@@ -77,9 +77,12 @@ func testTmpBlocksFile() error {
// Write blocks until their summary size exceeds `size`.
var addrs []tmpBlockAddr
var blocks []*storage.Block
bb := tmpBufPool.Get()
defer tmpBufPool.Put(bb)
for tbf.offset < uint64(size) {
b := createBlock()
addr, err := tbf.WriteBlock(b)
bb.B = storage.MarshalBlock(bb.B[:0], b)
addr, err := tbf.WriteBlockData(bb.B)
if err != nil {
return fmt.Errorf("cannot write block at offset %d: %s", tbf.offset, err)
}
@@ -94,7 +97,7 @@ func testTmpBlocksFile() error {
}
// Read blocks in parallel and verify them
concurrency := 3
concurrency := 2
workCh := make(chan int)
doneCh := make(chan error)
for i := 0; i < concurrency; i++ {

View File

@@ -13,7 +13,7 @@
{% for i, ts := range rs.Timestamps %}
{%z= bb.B %}{% space %}
{%f= rs.Values[i] %}{% space %}
{%d= int(ts) %}{% newline %}
{%dl= ts %}{% newline %}
{% endfor %}
{% code quicktemplate.ReleaseByteBuffer(bb) %}
{% endfunc %}
@@ -35,10 +35,10 @@
"timestamps":[
{% if len(rs.Timestamps) > 0 %}
{% code timestamps := rs.Timestamps %}
{%d= int(timestamps[0]) %}
{%dl= timestamps[0] %}
{% code timestamps = timestamps[1:] %}
{% for _, ts := range timestamps %}
,{%d= int(ts) %}
,{%dl= ts %}
{% endfor %}
{% endif %}
]

View File

@@ -49,7 +49,7 @@ func StreamExportPrometheusLine(qw422016 *qt422016.Writer, rs *netstorage.Result
//line app/vmselect/prometheus/export.qtpl:15
qw422016.N().S(` `)
//line app/vmselect/prometheus/export.qtpl:16
qw422016.N().D(int(ts))
qw422016.N().DL(ts)
//line app/vmselect/prometheus/export.qtpl:16
qw422016.N().S(`
`)
@@ -129,7 +129,7 @@ func StreamExportJSONLine(qw422016 *qt422016.Writer, rs *netstorage.Result) {
timestamps := rs.Timestamps
//line app/vmselect/prometheus/export.qtpl:38
qw422016.N().D(int(timestamps[0]))
qw422016.N().DL(timestamps[0])
//line app/vmselect/prometheus/export.qtpl:39
timestamps = timestamps[1:]
@@ -138,7 +138,7 @@ func StreamExportJSONLine(qw422016 *qt422016.Writer, rs *netstorage.Result) {
//line app/vmselect/prometheus/export.qtpl:40
qw422016.N().S(`,`)
//line app/vmselect/prometheus/export.qtpl:41
qw422016.N().D(int(ts))
qw422016.N().DL(ts)
//line app/vmselect/prometheus/export.qtpl:42
}
//line app/vmselect/prometheus/export.qtpl:43

View File

@@ -10,7 +10,7 @@
{% if len(rs.Timestamps) == 0 || len(rs.Values) == 0 %}{% return %}{% endif %}
{%= prometheusMetricName(&rs.MetricName) %}{% space %}
{%f= rs.Values[len(rs.Values)-1] %}{% space %}
{%d= int(rs.Timestamps[len(rs.Timestamps)-1]) %}{% newline %}
{%dl= rs.Timestamps[len(rs.Timestamps)-1] %}{% newline %}
{% endfunc %}
{% endstripspace %}

View File

@@ -41,7 +41,7 @@ func StreamFederate(qw422016 *qt422016.Writer, rs *netstorage.Result) {
//line app/vmselect/prometheus/federate.qtpl:12
qw422016.N().S(` `)
//line app/vmselect/prometheus/federate.qtpl:13
qw422016.N().D(int(rs.Timestamps[len(rs.Timestamps)-1]))
qw422016.N().DL(rs.Timestamps[len(rs.Timestamps)-1])
//line app/vmselect/prometheus/federate.qtpl:13
qw422016.N().S(`
`)

View File

@@ -0,0 +1,17 @@
{% import "github.com/VictoriaMetrics/VictoriaMetrics/lib/storage" %}
{% stripspace %}
LabelsCountResponse generates response for /api/v1/label_entries .
{% func LabelsCountResponse(labelEntries []storage.TagEntry) %}
{
"status":"success",
"data":{
{% for i, e := range labelEntries %}
{%q= e.Key %}:{%d= len(e.Values) %}
{% if i+1 < len(labelEntries) %},{% endif %}
{% endfor %}
}
}
{% endfunc %}
{% endstripspace %}

View File

@@ -0,0 +1,74 @@
// Code generated by qtc from "labels_count_response.qtpl". DO NOT EDIT.
// See https://github.com/valyala/quicktemplate for details.
//line app/vmselect/prometheus/labels_count_response.qtpl:1
package prometheus
//line app/vmselect/prometheus/labels_count_response.qtpl:1
import "github.com/VictoriaMetrics/VictoriaMetrics/lib/storage"
// LabelsCountResponse generates response for /api/v1/label_entries .
//line app/vmselect/prometheus/labels_count_response.qtpl:5
import (
qtio422016 "io"
qt422016 "github.com/valyala/quicktemplate"
)
//line app/vmselect/prometheus/labels_count_response.qtpl:5
var (
_ = qtio422016.Copy
_ = qt422016.AcquireByteBuffer
)
//line app/vmselect/prometheus/labels_count_response.qtpl:5
func StreamLabelsCountResponse(qw422016 *qt422016.Writer, labelEntries []storage.TagEntry) {
//line app/vmselect/prometheus/labels_count_response.qtpl:5
qw422016.N().S(`{"status":"success","data":{`)
//line app/vmselect/prometheus/labels_count_response.qtpl:9
for i, e := range labelEntries {
//line app/vmselect/prometheus/labels_count_response.qtpl:10
qw422016.N().Q(e.Key)
//line app/vmselect/prometheus/labels_count_response.qtpl:10
qw422016.N().S(`:`)
//line app/vmselect/prometheus/labels_count_response.qtpl:10
qw422016.N().D(len(e.Values))
//line app/vmselect/prometheus/labels_count_response.qtpl:11
if i+1 < len(labelEntries) {
//line app/vmselect/prometheus/labels_count_response.qtpl:11
qw422016.N().S(`,`)
//line app/vmselect/prometheus/labels_count_response.qtpl:11
}
//line app/vmselect/prometheus/labels_count_response.qtpl:12
}
//line app/vmselect/prometheus/labels_count_response.qtpl:12
qw422016.N().S(`}}`)
//line app/vmselect/prometheus/labels_count_response.qtpl:15
}
//line app/vmselect/prometheus/labels_count_response.qtpl:15
func WriteLabelsCountResponse(qq422016 qtio422016.Writer, labelEntries []storage.TagEntry) {
//line app/vmselect/prometheus/labels_count_response.qtpl:15
qw422016 := qt422016.AcquireWriter(qq422016)
//line app/vmselect/prometheus/labels_count_response.qtpl:15
StreamLabelsCountResponse(qw422016, labelEntries)
//line app/vmselect/prometheus/labels_count_response.qtpl:15
qt422016.ReleaseWriter(qw422016)
//line app/vmselect/prometheus/labels_count_response.qtpl:15
}
//line app/vmselect/prometheus/labels_count_response.qtpl:15
func LabelsCountResponse(labelEntries []storage.TagEntry) string {
//line app/vmselect/prometheus/labels_count_response.qtpl:15
qb422016 := qt422016.AcquireByteBuffer()
//line app/vmselect/prometheus/labels_count_response.qtpl:15
WriteLabelsCountResponse(qb422016, labelEntries)
//line app/vmselect/prometheus/labels_count_response.qtpl:15
qs422016 := string(qb422016.B)
//line app/vmselect/prometheus/labels_count_response.qtpl:15
qt422016.ReleaseByteBuffer(qb422016)
//line app/vmselect/prometheus/labels_count_response.qtpl:15
return qs422016
//line app/vmselect/prometheus/labels_count_response.qtpl:15
}

File diff suppressed because it is too large

View File

@@ -0,0 +1,115 @@
package prometheus
import (
"fmt"
"math"
"net/http"
"net/url"
"reflect"
"testing"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/netstorage"
)
func TestRemoveNaNValuesInplace(t *testing.T) {
f := func(tss []netstorage.Result, tssExpected []netstorage.Result) {
t.Helper()
removeNaNValuesInplace(tss)
if !reflect.DeepEqual(tss, tssExpected) {
t.Fatalf("unexpected result; got %v; want %v", tss, tssExpected)
}
}
nan := math.NaN()
f(nil, nil)
f([]netstorage.Result{
{
Timestamps: []int64{100, 200, 300},
Values: []float64{1, 2, 3},
},
{
Timestamps: []int64{100, 200, 300, 400},
Values: []float64{nan, nan, 3, nan},
},
}, []netstorage.Result{
{
Timestamps: []int64{100, 200, 300},
Values: []float64{1, 2, 3},
},
{
Timestamps: []int64{300},
Values: []float64{3},
},
})
}
func TestGetTimeSuccess(t *testing.T) {
f := func(s string, timestampExpected int64) {
t.Helper()
urlStr := fmt.Sprintf("http://foo.bar/baz?s=%s", url.QueryEscape(s))
r, err := http.NewRequest("GET", urlStr, nil)
if err != nil {
t.Fatalf("unexpected error in NewRequest: %s", err)
}
// Verify defaultValue
ts, err := getTime(r, "foo", 123)
if err != nil {
t.Fatalf("unexpected error when obtaining default time from getTime(%q): %s", s, err)
}
if ts != 123 {
t.Fatalf("unexpected default value for getTime(%q); got %d; want %d", s, ts, 123)
}
// Verify timestampExpected
ts, err = getTime(r, "s", 123)
if err != nil {
t.Fatalf("unexpected error in getTime(%q): %s", s, err)
}
if ts != timestampExpected {
t.Fatalf("unexpected timestamp for getTime(%q); got %d; want %d", s, ts, timestampExpected)
}
}
f("2019-07-07T20:01:02Z", 1562529662000)
f("2019-07-07T20:47:40+03:00", 1562521660000)
f("-292273086-05-16T16:47:06Z", minTimeMsecs)
f("292277025-08-18T07:12:54.999999999Z", maxTimeMsecs)
f("1562529662.324", 1562529662324)
f("-9223372036.854", minTimeMsecs)
f("-9223372036.855", minTimeMsecs)
f("9223372036.855", maxTimeMsecs)
}
func TestGetTimeError(t *testing.T) {
f := func(s string) {
t.Helper()
urlStr := fmt.Sprintf("http://foo.bar/baz?s=%s", url.QueryEscape(s))
r, err := http.NewRequest("GET", urlStr, nil)
if err != nil {
t.Fatalf("unexpected error in NewRequest: %s", err)
}
// Verify defaultValue
ts, err := getTime(r, "foo", 123)
if err != nil {
t.Fatalf("unexpected error when obtaining default time from getTime(%q): %s", s, err)
}
if ts != 123 {
t.Fatalf("unexpected default value for getTime(%q); got %d; want %d", s, ts, 123)
}
// Verify timestampExpected
_, err = getTime(r, "s", 123)
if err == nil {
t.Fatalf("expecting non-nil error in getTime(%q)", s)
}
}
f("foo")
f("2019-07-07T20:01:02Zisdf")
f("2019-07-07T20:47:40+03:00123")
f("-292273086-05-16T16:47:07Z")
f("292277025-08-18T07:12:54.999999998Z")
}
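
A rough sketch of the time parsing these tests exercise: the value may be either an RFC3339 timestamp or a floating-point unix timestamp in seconds, and it is converted to milliseconds. parseTimeMsecs is an assumed helper for illustration, not the real getTime implementation (which also clamps to the min/max timestamps checked above).
package main

import (
	"fmt"
	"math"
	"strconv"
	"time"
)

func parseTimeMsecs(s string) (int64, error) {
	if t, err := time.Parse(time.RFC3339, s); err == nil {
		return t.UnixNano() / int64(time.Millisecond), nil
	}
	secs, err := strconv.ParseFloat(s, 64)
	if err != nil {
		return 0, fmt.Errorf("cannot parse %q as time: %s", s, err)
	}
	return int64(math.Round(secs * 1e3)), nil
}

func main() {
	fmt.Println(parseTimeMsecs("2019-07-07T20:01:02Z")) // 1562529662000 <nil>
	fmt.Println(parseTimeMsecs("1562529662.324"))       // 1562529662324 <nil>
}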

View File

@@ -3,7 +3,7 @@ SeriesCountResponse generates response for /api/v1/series/count .
{% func SeriesCountResponse(n uint64) %}
{
"status":"success",
"data":[{%d int(n) %}]
"data":[{%dl int64(n) %}]
}
{% endfunc %}
{% endstripspace %}

View File

@@ -24,7 +24,7 @@ func StreamSeriesCountResponse(qw422016 *qt422016.Writer, n uint64) {
//line app/vmselect/prometheus/series_count_response.qtpl:3
qw422016.N().S(`{"status":"success","data":[`)
//line app/vmselect/prometheus/series_count_response.qtpl:6
qw422016.N().D(int(n))
qw422016.N().DL(int64(n))
//line app/vmselect/prometheus/series_count_response.qtpl:6
qw422016.N().S(`]}`)
//line app/vmselect/prometheus/series_count_response.qtpl:8

View File

@@ -6,6 +6,12 @@ import (
"sort"
"strconv"
"strings"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/metricsql"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/storage"
"github.com/VictoriaMetrics/metrics"
"github.com/valyala/histogram"
)
var aggrFuncs = map[string]aggrFunc{
@@ -22,17 +28,28 @@ var aggrFuncs = map[string]aggrFunc{
"topk": newAggrFuncTopK(false),
"quantile": aggrFuncQuantile,
// Extended PromQL funcs
"median": aggrFuncMedian,
"limitk": aggrFuncLimitK,
"distinct": newAggrFunc(aggrFuncDistinct),
// PromQL extension funcs
"median": aggrFuncMedian,
"limitk": aggrFuncLimitK,
"distinct": newAggrFunc(aggrFuncDistinct),
"sum2": newAggrFunc(aggrFuncSum2),
"geomean": newAggrFunc(aggrFuncGeomean),
"histogram": newAggrFunc(aggrFuncHistogram),
"topk_min": newAggrFuncRangeTopK(minValue, false),
"topk_max": newAggrFuncRangeTopK(maxValue, false),
"topk_avg": newAggrFuncRangeTopK(avgValue, false),
"topk_median": newAggrFuncRangeTopK(medianValue, false),
"bottomk_min": newAggrFuncRangeTopK(minValue, true),
"bottomk_max": newAggrFuncRangeTopK(maxValue, true),
"bottomk_avg": newAggrFuncRangeTopK(avgValue, true),
"bottomk_median": newAggrFuncRangeTopK(medianValue, true),
}
type aggrFunc func(afa *aggrFuncArg) ([]*timeseries, error)
type aggrFuncArg struct {
args [][]*timeseries
ae *aggrFuncExpr
ae *metricsql.AggrFuncExpr
ec *EvalConfig
}
@@ -41,20 +58,6 @@ func getAggrFunc(s string) aggrFunc {
return aggrFuncs[s]
}
func isAggrFunc(s string) bool {
return getAggrFunc(s) != nil
}
func isAggrFuncModifier(s string) bool {
s = strings.ToLower(s)
switch s {
case "by", "without":
return true
default:
return false
}
}
func newAggrFunc(afe func(tss []*timeseries) []*timeseries) aggrFunc {
return func(afa *aggrFuncArg) ([]*timeseries, error) {
args := afa.args
@@ -65,33 +68,26 @@ func newAggrFunc(afe func(tss []*timeseries) []*timeseries) aggrFunc {
}
}
func aggrFuncExt(afe func(tss []*timeseries) []*timeseries, argOrig []*timeseries, modifier *modifierExpr, keepOriginal bool) ([]*timeseries, error) {
arg := copyTimeseriesMetricNames(argOrig)
// Filter out superfluous tags.
var groupTags []string
groupOp := "by"
if modifier.Op != "" {
groupTags = modifier.Args
groupOp = strings.ToLower(modifier.Op)
}
func removeGroupTags(metricName *storage.MetricName, modifier *metricsql.ModifierExpr) {
groupOp := strings.ToLower(modifier.Op)
switch groupOp {
case "by":
for _, ts := range arg {
ts.MetricName.RemoveTagsOn(groupTags)
}
case "", "by":
metricName.RemoveTagsOn(modifier.Args)
case "without":
for _, ts := range arg {
ts.MetricName.RemoveTagsIgnoring(groupTags)
}
metricName.RemoveTagsIgnoring(modifier.Args)
default:
return nil, fmt.Errorf(`unknown modifier: %q`, groupOp)
logger.Panicf("BUG: unknown group modifier: %q", groupOp)
}
}
func aggrFuncExt(afe func(tss []*timeseries) []*timeseries, argOrig []*timeseries, modifier *metricsql.ModifierExpr, keepOriginal bool) ([]*timeseries, error) {
arg := copyTimeseriesMetricNames(argOrig)
// Perform grouping.
m := make(map[string][]*timeseries)
bb := bbPool.Get()
for i, ts := range arg {
removeGroupTags(&ts.MetricName, modifier)
bb.B = marshalMetricNameSorted(bb.B[:0], &ts.MetricName)
if keepOriginal {
ts = argOrig[i]
@@ -100,10 +96,18 @@ func aggrFuncExt(afe func(tss []*timeseries) []*timeseries, argOrig []*timeserie
}
bbPool.Put(bb)
srcTssCount := 0
dstTssCount := 0
rvs := make([]*timeseries, 0, len(m))
for _, tss := range m {
rv := afe(tss)
rvs = append(rvs, rv...)
srcTssCount += len(tss)
dstTssCount += len(rv)
if dstTssCount > 2000 && dstTssCount > 16*srcTssCount {
// This looks like count_values explosion.
return nil, fmt.Errorf(`too many timeseries after aggregation; got %d; want less than %d`, dstTssCount, 16*srcTssCount)
}
}
return rvs, nil
}
@@ -132,6 +136,84 @@ func aggrFuncSum(tss []*timeseries) []*timeseries {
return tss[:1]
}
func aggrFuncSum2(tss []*timeseries) []*timeseries {
dst := tss[0]
for i := range dst.Values {
sum2 := float64(0)
count := 0
for _, ts := range tss {
v := ts.Values[i]
if math.IsNaN(v) {
continue
}
sum2 += v * v
count++
}
if count == 0 {
sum2 = nan
}
dst.Values[i] = sum2
}
return tss[:1]
}
func aggrFuncGeomean(tss []*timeseries) []*timeseries {
if len(tss) == 1 {
// Fast path - nothing to geomean.
return tss
}
dst := tss[0]
for i := range dst.Values {
p := 1.0
count := 0
for _, ts := range tss {
v := ts.Values[i]
if math.IsNaN(v) {
continue
}
p *= v
count++
}
if count == 0 {
p = nan
}
dst.Values[i] = math.Pow(p, 1/float64(count))
}
return tss[:1]
}
func aggrFuncHistogram(tss []*timeseries) []*timeseries {
var h metrics.Histogram
m := make(map[string]*timeseries)
for i := range tss[0].Values {
h.Reset()
for _, ts := range tss {
v := ts.Values[i]
h.Update(v)
}
h.VisitNonZeroBuckets(func(vmrange string, count uint64) {
ts := m[vmrange]
if ts == nil {
ts = &timeseries{}
ts.CopyFromShallowTimestamps(tss[0])
ts.MetricName.RemoveTag("vmrange")
ts.MetricName.AddTag("vmrange", vmrange)
values := ts.Values
for k := range values {
values[k] = 0
}
m[vmrange] = ts
}
ts.Values[i] = float64(count)
})
}
rvs := make([]*timeseries, 0, len(m))
for _, ts := range m {
rvs = append(rvs, ts)
}
return vmrangeBucketsToLE(rvs)
}
func aggrFuncMin(tss []*timeseries) []*timeseries {
if len(tss) == 1 {
// Fast path - nothing to min.
@@ -260,7 +342,11 @@ func aggrFuncCount(tss []*timeseries) []*timeseries {
}
count++
}
dst.Values[i] = float64(count)
v := float64(count)
if count == 0 {
v = nan
}
dst.Values[i] = v
}
return tss[:1]
}
@@ -297,10 +383,32 @@ func aggrFuncCountValues(afa *aggrFuncArg) ([]*timeseries, error) {
if err != nil {
return nil, err
}
// Remove dstLabel from grouping like Prometheus does.
modifier := &afa.ae.Modifier
switch strings.ToLower(modifier.Op) {
case "without":
modifier.Args = append(modifier.Args, dstLabel)
case "by":
dstArgs := modifier.Args[:0]
for _, arg := range modifier.Args {
if arg == dstLabel {
continue
}
dstArgs = append(dstArgs, arg)
}
modifier.Args = dstArgs
default:
// Do nothing
}
afe := func(tss []*timeseries) []*timeseries {
m := make(map[float64]bool)
for _, ts := range tss {
for _, v := range ts.Values {
if math.IsNaN(v) {
continue
}
m[v] = true
}
}
@@ -313,7 +421,7 @@ func aggrFuncCountValues(afa *aggrFuncArg) ([]*timeseries, error) {
var rvs []*timeseries
for _, v := range values {
var dst timeseries
dst.CopyFrom(tss[0])
dst.CopyFromShallowTimestamps(tss[0])
dst.MetricName.RemoveTag(dstLabel)
dst.MetricName.AddTag(dstLabel, strconv.FormatFloat(v, 'g', -1, 64))
for i := range dst.Values {
@@ -347,37 +455,138 @@ func newAggrFuncTopK(isReverse bool) aggrFunc {
return nil, err
}
afe := func(tss []*timeseries) []*timeseries {
rvs := tss
for n := range rvs[0].Values {
sort.Slice(rvs, func(i, j int) bool {
a := rvs[i].Values[n]
b := rvs[j].Values[n]
cmp := lessWithNaNs(a, b)
for n := range tss[0].Values {
sort.Slice(tss, func(i, j int) bool {
a := tss[i].Values[n]
b := tss[j].Values[n]
if isReverse {
cmp = !cmp
a, b = b, a
}
return cmp
return lessWithNaNs(a, b)
})
if math.IsNaN(ks[n]) {
ks[n] = 0
}
k := int(ks[n])
if k < 0 {
k = 0
}
if k > len(rvs) {
k = len(rvs)
}
for _, ts := range rvs[:len(rvs)-k] {
ts.Values[n] = nan
}
fillNaNsAtIdx(n, ks[n], tss)
}
return rvs
return removeNaNs(tss)
}
return aggrFuncExt(afe, args[1], &afa.ae.Modifier, true)
}
}
type tsWithValue struct {
ts *timeseries
value float64
}
func newAggrFuncRangeTopK(f func(values []float64) float64, isReverse bool) aggrFunc {
return func(afa *aggrFuncArg) ([]*timeseries, error) {
args := afa.args
if err := expectTransformArgsNum(args, 2); err != nil {
return nil, err
}
ks, err := getScalar(args[0], 0)
if err != nil {
return nil, err
}
afe := func(tss []*timeseries) []*timeseries {
maxs := make([]tsWithValue, len(tss))
for i, ts := range tss {
value := f(ts.Values)
maxs[i] = tsWithValue{
ts: ts,
value: value,
}
}
sort.Slice(maxs, func(i, j int) bool {
a := maxs[i].value
b := maxs[j].value
if isReverse {
a, b = b, a
}
return lessWithNaNs(a, b)
})
for i := range maxs {
tss[i] = maxs[i].ts
}
for i, k := range ks {
fillNaNsAtIdx(i, k, tss)
}
return removeNaNs(tss)
}
return aggrFuncExt(afe, args[1], &afa.ae.Modifier, true)
}
}
func fillNaNsAtIdx(idx int, k float64, tss []*timeseries) {
if math.IsNaN(k) {
k = 0
}
kn := int(k)
if kn < 0 {
kn = 0
}
if kn > len(tss) {
kn = len(tss)
}
for _, ts := range tss[:len(tss)-kn] {
ts.Values[idx] = nan
}
}
func minValue(values []float64) float64 {
if len(values) == 0 {
return nan
}
min := values[0]
for _, v := range values[1:] {
if v < min {
min = v
}
}
return min
}
func maxValue(values []float64) float64 {
if len(values) == 0 {
return nan
}
max := values[0]
for _, v := range values[1:] {
if v > max {
max = v
}
}
return max
}
func avgValue(values []float64) float64 {
sum := float64(0)
count := 0
for _, v := range values {
if math.IsNaN(v) {
continue
}
count++
sum += v
}
if count == 0 {
return nan
}
return sum / float64(count)
}
func medianValue(values []float64) float64 {
h := histogram.GetFast()
for _, v := range values {
if math.IsNaN(v) {
continue
}
h.Update(v)
}
value := h.Quantile(0.5)
histogram.PutFast(h)
return value
}
func aggrFuncLimitK(afa *aggrFuncArg) ([]*timeseries, error) {
args := afa.args
if err := expectTransformArgsNum(args, 2); err != nil {
@@ -457,6 +666,7 @@ func newAggrQuantileFunc(phis []float64) func(tss []*timeseries) []*timeseries {
idx := int(math.Round(float64(len(tss)-1) * phi))
dst.Values[n] = tss[idx].Values[n]
}
tss[0] = dst
return tss[:1]
}
}
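
To make the NaN handling in the new aggregate functions concrete, here is a standalone sketch of the geomean calculation at a single timestamp: NaN values are skipped, and the result is the count-th root of the product of the remaining values. geomeanAt is an illustrative helper, not part of the promql package.
package main

import (
	"fmt"
	"math"
)

// geomeanAt computes the geometric mean of the non-NaN values.
func geomeanAt(values []float64) float64 {
	p := 1.0
	count := 0
	for _, v := range values {
		if math.IsNaN(v) {
			continue
		}
		p *= v
		count++
	}
	if count == 0 {
		return math.NaN()
	}
	return math.Pow(p, 1/float64(count))
}

func main() {
	fmt.Println(geomeanAt([]float64{2, 8}))             // 4
	fmt.Println(geomeanAt([]float64{math.NaN(), 3, 9})) // ~5.196
}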

View File

@@ -0,0 +1,452 @@
package promql
import (
"math"
"strings"
"sync"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/metricsql"
)
// callbacks for optimized incremental calculations for aggregate functions
// over rollups over metricsql.MetricExpr.
//
// These calculations save RAM for aggregates over a big number of time series.
var incrementalAggrFuncCallbacksMap = map[string]*incrementalAggrFuncCallbacks{
"sum": {
updateAggrFunc: updateAggrSum,
mergeAggrFunc: mergeAggrSum,
finalizeAggrFunc: finalizeAggrCommon,
},
"min": {
updateAggrFunc: updateAggrMin,
mergeAggrFunc: mergeAggrMin,
finalizeAggrFunc: finalizeAggrCommon,
},
"max": {
updateAggrFunc: updateAggrMax,
mergeAggrFunc: mergeAggrMax,
finalizeAggrFunc: finalizeAggrCommon,
},
"avg": {
updateAggrFunc: updateAggrAvg,
mergeAggrFunc: mergeAggrAvg,
finalizeAggrFunc: finalizeAggrAvg,
},
"count": {
updateAggrFunc: updateAggrCount,
mergeAggrFunc: mergeAggrCount,
finalizeAggrFunc: finalizeAggrCount,
},
"sum2": {
updateAggrFunc: updateAggrSum2,
mergeAggrFunc: mergeAggrSum2,
finalizeAggrFunc: finalizeAggrCommon,
},
"geomean": {
updateAggrFunc: updateAggrGeomean,
mergeAggrFunc: mergeAggrGeomean,
finalizeAggrFunc: finalizeAggrGeomean,
},
}
type incrementalAggrFuncContext struct {
ae *metricsql.AggrFuncExpr
mLock sync.Mutex
m map[uint]map[string]*incrementalAggrContext
callbacks *incrementalAggrFuncCallbacks
}
func newIncrementalAggrFuncContext(ae *metricsql.AggrFuncExpr, callbacks *incrementalAggrFuncCallbacks) *incrementalAggrFuncContext {
return &incrementalAggrFuncContext{
ae: ae,
m: make(map[uint]map[string]*incrementalAggrContext),
callbacks: callbacks,
}
}
func (iafc *incrementalAggrFuncContext) updateTimeseries(ts *timeseries, workerID uint) {
iafc.mLock.Lock()
m := iafc.m[workerID]
if m == nil {
m = make(map[string]*incrementalAggrContext, 1)
iafc.m[workerID] = m
}
iafc.mLock.Unlock()
removeGroupTags(&ts.MetricName, &iafc.ae.Modifier)
bb := bbPool.Get()
bb.B = marshalMetricNameSorted(bb.B[:0], &ts.MetricName)
iac := m[string(bb.B)]
if iac == nil {
tsAggr := &timeseries{
Values: make([]float64, len(ts.Values)),
Timestamps: ts.Timestamps,
denyReuse: true,
}
tsAggr.MetricName.CopyFrom(&ts.MetricName)
iac = &incrementalAggrContext{
ts: tsAggr,
values: make([]float64, len(ts.Values)),
}
m[string(bb.B)] = iac
}
bbPool.Put(bb)
iafc.callbacks.updateAggrFunc(iac, ts.Values)
}
func (iafc *incrementalAggrFuncContext) finalizeTimeseries() []*timeseries {
// There is no need for iafc.mLock.Lock here, since finalizeTimeseries must be called
// without concurrent goroutines touching iafc.
mGlobal := make(map[string]*incrementalAggrContext)
mergeAggrFunc := iafc.callbacks.mergeAggrFunc
for _, m := range iafc.m {
for k, iac := range m {
iacGlobal := mGlobal[k]
if iacGlobal == nil {
mGlobal[k] = iac
continue
}
mergeAggrFunc(iacGlobal, iac)
}
}
tss := make([]*timeseries, 0, len(mGlobal))
finalizeAggrFunc := iafc.callbacks.finalizeAggrFunc
for _, iac := range mGlobal {
finalizeAggrFunc(iac)
tss = append(tss, iac.ts)
}
return tss
}
type incrementalAggrFuncCallbacks struct {
updateAggrFunc func(iac *incrementalAggrContext, values []float64)
mergeAggrFunc func(dst, src *incrementalAggrContext)
finalizeAggrFunc func(iac *incrementalAggrContext)
}
func getIncrementalAggrFuncCallbacks(name string) *incrementalAggrFuncCallbacks {
name = strings.ToLower(name)
return incrementalAggrFuncCallbacksMap[name]
}
type incrementalAggrContext struct {
ts *timeseries
values []float64
}
func finalizeAggrCommon(iac *incrementalAggrContext) {
counts := iac.values
dstValues := iac.ts.Values
_ = dstValues[len(counts)-1]
for i, v := range counts {
if v == 0 {
dstValues[i] = nan
}
}
}
func updateAggrSum(iac *incrementalAggrContext, values []float64) {
dstValues := iac.ts.Values
dstCounts := iac.values
_ = dstValues[len(values)-1]
_ = dstCounts[len(values)-1]
for i, v := range values {
if math.IsNaN(v) {
continue
}
if dstCounts[i] == 0 {
dstValues[i] = v
dstCounts[i] = 1
continue
}
dstValues[i] += v
}
}
func mergeAggrSum(dst, src *incrementalAggrContext) {
srcValues := src.ts.Values
dstValues := dst.ts.Values
srcCounts := src.values
dstCounts := dst.values
_ = srcCounts[len(srcValues)-1]
_ = dstCounts[len(srcValues)-1]
_ = dstValues[len(srcValues)-1]
for i, v := range srcValues {
if srcCounts[i] == 0 {
continue
}
if dstCounts[i] == 0 {
dstValues[i] = v
dstCounts[i] = 1
continue
}
dstValues[i] += v
}
}
func updateAggrMin(iac *incrementalAggrContext, values []float64) {
dstValues := iac.ts.Values
dstCounts := iac.values
_ = dstValues[len(values)-1]
_ = dstCounts[len(values)-1]
for i, v := range values {
if math.IsNaN(v) {
continue
}
if dstCounts[i] == 0 {
dstValues[i] = v
dstCounts[i] = 1
continue
}
if v < dstValues[i] {
dstValues[i] = v
}
}
}
func mergeAggrMin(dst, src *incrementalAggrContext) {
srcValues := src.ts.Values
dstValues := dst.ts.Values
srcCounts := src.values
dstCounts := dst.values
_ = srcCounts[len(srcValues)-1]
_ = dstCounts[len(srcValues)-1]
_ = dstValues[len(srcValues)-1]
for i, v := range srcValues {
if srcCounts[i] == 0 {
continue
}
if dstCounts[i] == 0 {
dstValues[i] = v
dstCounts[i] = 1
continue
}
if v < dstValues[i] {
dstValues[i] = v
}
}
}
func updateAggrMax(iac *incrementalAggrContext, values []float64) {
dstValues := iac.ts.Values
dstCounts := iac.values
_ = dstValues[len(values)-1]
_ = dstCounts[len(values)-1]
for i, v := range values {
if math.IsNaN(v) {
continue
}
if dstCounts[i] == 0 {
dstValues[i] = v
dstCounts[i] = 1
continue
}
if v > dstValues[i] {
dstValues[i] = v
}
}
}
func mergeAggrMax(dst, src *incrementalAggrContext) {
srcValues := src.ts.Values
dstValues := dst.ts.Values
srcCounts := src.values
dstCounts := dst.values
_ = srcCounts[len(srcValues)-1]
_ = dstCounts[len(srcValues)-1]
_ = dstValues[len(srcValues)-1]
for i, v := range srcValues {
if srcCounts[i] == 0 {
continue
}
if dstCounts[i] == 0 {
dstValues[i] = v
dstCounts[i] = 1
continue
}
if v > dstValues[i] {
dstValues[i] = v
}
}
}
func updateAggrAvg(iac *incrementalAggrContext, values []float64) {
// Do not use `Rapid calculation methods` at https://en.wikipedia.org/wiki/Standard_deviation,
// since they are slower and offer no obvious benefit in precision.
dstValues := iac.ts.Values
dstCounts := iac.values
_ = dstValues[len(values)-1]
_ = dstCounts[len(values)-1]
for i, v := range values {
if math.IsNaN(v) {
continue
}
if dstCounts[i] == 0 {
dstValues[i] = v
dstCounts[i] = 1
continue
}
dstValues[i] += v
dstCounts[i]++
}
}
func mergeAggrAvg(dst, src *incrementalAggrContext) {
srcValues := src.ts.Values
dstValues := dst.ts.Values
srcCounts := src.values
dstCounts := dst.values
_ = srcCounts[len(srcValues)-1]
_ = dstCounts[len(srcValues)-1]
_ = dstValues[len(srcValues)-1]
for i, v := range srcValues {
if srcCounts[i] == 0 {
continue
}
if dstCounts[i] == 0 {
dstValues[i] = v
dstCounts[i] = srcCounts[i]
continue
}
dstValues[i] += v
dstCounts[i] += srcCounts[i]
}
}
func finalizeAggrAvg(iac *incrementalAggrContext) {
dstValues := iac.ts.Values
counts := iac.values
_ = dstValues[len(counts)-1]
for i, v := range counts {
if v == 0 {
dstValues[i] = nan
continue
}
dstValues[i] /= v
}
}
func updateAggrCount(iac *incrementalAggrContext, values []float64) {
dstValues := iac.ts.Values
_ = dstValues[len(values)-1]
for i, v := range values {
if math.IsNaN(v) {
continue
}
dstValues[i]++
}
}
func mergeAggrCount(dst, src *incrementalAggrContext) {
srcValues := src.ts.Values
dstValues := dst.ts.Values
_ = dstValues[len(srcValues)-1]
for i, v := range srcValues {
dstValues[i] += v
}
}
func finalizeAggrCount(iac *incrementalAggrContext) {
dstValues := iac.ts.Values
for i, v := range dstValues {
if v == 0 {
dstValues[i] = nan
}
}
}
func updateAggrSum2(iac *incrementalAggrContext, values []float64) {
dstValues := iac.ts.Values
dstCounts := iac.values
_ = dstValues[len(values)-1]
_ = dstCounts[len(values)-1]
for i, v := range values {
if math.IsNaN(v) {
continue
}
if dstCounts[i] == 0 {
dstValues[i] = v * v
dstCounts[i] = 1
continue
}
dstValues[i] += v * v
}
}
func mergeAggrSum2(dst, src *incrementalAggrContext) {
srcValues := src.ts.Values
dstValues := dst.ts.Values
srcCounts := src.values
dstCounts := dst.values
_ = srcCounts[len(srcValues)-1]
_ = dstCounts[len(srcValues)-1]
_ = dstValues[len(srcValues)-1]
for i, v := range srcValues {
if srcCounts[i] == 0 {
continue
}
if dstCounts[i] == 0 {
dstValues[i] = v
dstCounts[i] = 1
continue
}
dstValues[i] += v
}
}
func updateAggrGeomean(iac *incrementalAggrContext, values []float64) {
dstValues := iac.ts.Values
dstCounts := iac.values
_ = dstValues[len(values)-1]
_ = dstCounts[len(values)-1]
for i, v := range values {
if math.IsNaN(v) {
continue
}
if dstCounts[i] == 0 {
dstValues[i] = v
dstCounts[i] = 1
continue
}
dstValues[i] *= v
dstCounts[i]++
}
}
func mergeAggrGeomean(dst, src *incrementalAggrContext) {
srcValues := src.ts.Values
dstValues := dst.ts.Values
srcCounts := src.values
dstCounts := dst.values
_ = srcCounts[len(srcValues)-1]
_ = dstCounts[len(srcValues)-1]
_ = dstValues[len(srcValues)-1]
for i, v := range srcValues {
if srcCounts[i] == 0 {
continue
}
if dstCounts[i] == 0 {
dstValues[i] = v
dstCounts[i] = srcCounts[i]
continue
}
dstValues[i] *= v
dstCounts[i] += srcCounts[i]
}
}
func finalizeAggrGeomean(iac *incrementalAggrContext) {
dstValues := iac.ts.Values
counts := iac.values
_ = dstValues[len(counts)-1]
for i, v := range counts {
if v == 0 {
dstValues[i] = nan
continue
}
dstValues[i] = math.Pow(dstValues[i], 1/v)
}
}
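
The update/merge/finalize structure above can be summarized with a small sketch for an incremental average: each worker accumulates per-timestamp sums and counts, the per-worker partials are merged once at the end, and a final pass divides sum by count. The partial type is illustrative only and is not the incrementalAggrContext API.
package main

import (
	"fmt"
	"math"
)

type partial struct {
	sums   []float64
	counts []float64
}

func newPartial(n int) *partial {
	return &partial{
		sums:   make([]float64, n),
		counts: make([]float64, n),
	}
}

// update accumulates the non-NaN values of a single time series.
func (p *partial) update(values []float64) {
	for i, v := range values {
		if math.IsNaN(v) {
			continue
		}
		p.sums[i] += v
		p.counts[i]++
	}
}

// merge folds another worker's partial state into p.
func (p *partial) merge(src *partial) {
	for i := range src.sums {
		p.sums[i] += src.sums[i]
		p.counts[i] += src.counts[i]
	}
}

// finalizeAvg produces the average, leaving NaN where no values were seen.
func (p *partial) finalizeAvg() []float64 {
	avg := make([]float64, len(p.sums))
	for i, s := range p.sums {
		if p.counts[i] == 0 {
			avg[i] = math.NaN()
			continue
		}
		avg[i] = s / p.counts[i]
	}
	return avg
}

func main() {
	w1, w2 := newPartial(3), newPartial(3)
	w1.update([]float64{1, math.NaN(), 2})
	w2.update([]float64{3, math.NaN(), 4})
	w1.merge(w2)
	fmt.Println(w1.finalizeAvg()) // [2 NaN 3]
}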

View File

@@ -0,0 +1,190 @@
package promql
import (
"fmt"
"math"
"reflect"
"runtime"
"sync"
"testing"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/metricsql"
)
func TestIncrementalAggr(t *testing.T) {
defaultTimestamps := []int64{100e3, 200e3, 300e3, 400e3}
values := [][]float64{
{1, nan, 2, nan},
{3, nan, nan, 4},
{nan, nan, 5, 6},
{7, nan, 8, 9},
{4, nan, nan, nan},
{2, nan, 3, 2},
{0, nan, 1, 1},
}
tssSrc := make([]*timeseries, len(values))
for i, vs := range values {
ts := &timeseries{
Timestamps: defaultTimestamps,
Values: vs,
}
tssSrc[i] = ts
}
copyTimeseries := func(tssSrc []*timeseries) []*timeseries {
tssDst := make([]*timeseries, len(tssSrc))
for i, tsSrc := range tssSrc {
var tsDst timeseries
tsDst.CopyFromShallowTimestamps(tsSrc)
tssDst[i] = &tsDst
}
return tssDst
}
f := func(name string, valuesExpected []float64) {
t.Helper()
callbacks := getIncrementalAggrFuncCallbacks(name)
ae := &metricsql.AggrFuncExpr{
Name: name,
}
tssExpected := []*timeseries{{
Timestamps: defaultTimestamps,
Values: valuesExpected,
}}
// run the test multiple times to make sure there are no side effects on concurrency
for i := 0; i < 10; i++ {
iafc := newIncrementalAggrFuncContext(ae, callbacks)
tssSrcCopy := copyTimeseries(tssSrc)
if err := testIncrementalParallelAggr(iafc, tssSrcCopy, tssExpected); err != nil {
t.Fatalf("unexpected error on iteration %d: %s", i, err)
}
}
}
t.Run("sum", func(t *testing.T) {
t.Parallel()
valuesExpected := []float64{17, nan, 19, 22}
f("sum", valuesExpected)
})
t.Run("min", func(t *testing.T) {
t.Parallel()
valuesExpected := []float64{0, nan, 1, 1}
f("min", valuesExpected)
})
t.Run("max", func(t *testing.T) {
t.Parallel()
valuesExpected := []float64{7, nan, 8, 9}
f("max", valuesExpected)
})
t.Run("avg", func(t *testing.T) {
t.Parallel()
valuesExpected := []float64{2.8333333333333335, nan, 3.8, 4.4}
f("avg", valuesExpected)
})
t.Run("count", func(t *testing.T) {
t.Parallel()
valuesExpected := []float64{6, nan, 5, 5}
f("count", valuesExpected)
})
t.Run("sum2", func(t *testing.T) {
t.Parallel()
valuesExpected := []float64{79, nan, 103, 138}
f("sum2", valuesExpected)
})
t.Run("geomean", func(t *testing.T) {
t.Parallel()
valuesExpected := []float64{0, nan, 2.9925557394776896, 3.365865436338599}
f("geomean", valuesExpected)
})
}
func testIncrementalParallelAggr(iafc *incrementalAggrFuncContext, tssSrc, tssExpected []*timeseries) error {
const workersCount = 3
tsCh := make(chan *timeseries)
var wg sync.WaitGroup
wg.Add(workersCount)
for i := 0; i < workersCount; i++ {
go func(workerID uint) {
defer wg.Done()
for ts := range tsCh {
runtime.Gosched() // let other goroutines perform the work
iafc.updateTimeseries(ts, workerID)
}
}(uint(i))
}
for _, ts := range tssSrc {
tsCh <- ts
}
close(tsCh)
wg.Wait()
tssActual := iafc.finalizeTimeseries()
if err := expectTimeseriesEqual(tssActual, tssExpected); err != nil {
return fmt.Errorf("%s; tssActual=%v, tssExpected=%v", err, tssActual, tssExpected)
}
return nil
}
func expectTimeseriesEqual(actual, expected []*timeseries) error {
if len(actual) != len(expected) {
return fmt.Errorf("unexpected number of time series; got %d; want %d", len(actual), len(expected))
}
mActual := timeseriesToMap(actual)
mExpected := timeseriesToMap(expected)
if len(mActual) != len(mExpected) {
return fmt.Errorf("unexpected number of time series after converting to map; got %d; want %d", len(mActual), len(mExpected))
}
for k, tsExpected := range mExpected {
tsActual := mActual[k]
if tsActual == nil {
return fmt.Errorf("missing time series for key=%q", k)
}
if err := expectTsEqual(tsActual, tsExpected); err != nil {
return err
}
}
return nil
}
func timeseriesToMap(tss []*timeseries) map[string]*timeseries {
m := make(map[string]*timeseries, len(tss))
for _, ts := range tss {
k := ts.MetricName.Marshal(nil)
m[string(k)] = ts
}
return m
}
func expectTsEqual(actual, expected *timeseries) error {
mnActual := actual.MetricName.Marshal(nil)
mnExpected := expected.MetricName.Marshal(nil)
if string(mnActual) != string(mnExpected) {
return fmt.Errorf("unexpected metric name; got %q; want %q", mnActual, mnExpected)
}
if !reflect.DeepEqual(actual.Timestamps, expected.Timestamps) {
return fmt.Errorf("unexpected timestamps; got %v; want %v", actual.Timestamps, expected.Timestamps)
}
if err := compareValues(actual.Values, expected.Values); err != nil {
return fmt.Errorf("%s; actual %v; expected %v", err, actual.Values, expected.Values)
}
return nil
}
func compareValues(vs1, vs2 []float64) error {
if len(vs1) != len(vs2) {
return fmt.Errorf("unexpected number of values; got %d; want %d", len(vs1), len(vs2))
}
for i, v1 := range vs1 {
v2 := vs2[i]
if math.IsNaN(v1) {
if !math.IsNaN(v2) {
return fmt.Errorf("unexpected value; got %v; want %v", v1, v2)
}
continue
}
eps := math.Abs(v1 - v2)
if eps > 1e-14 {
return fmt.Errorf("unexpected value; got %v; want %v", v1, v2)
}
}
return nil
}

View File

@@ -0,0 +1,5 @@
package promql
import "unsafe"
const maxByteSliceLen = 1<<(31+9*(unsafe.Sizeof(int(0))/8)) - 1

View File

@@ -1,3 +0,0 @@
package promql
const maxByteSliceLen = 1 << 40

View File

@@ -1,3 +0,0 @@
package promql
const maxByteSliceLen = 1<<31 - 1

View File

@@ -6,63 +6,36 @@ import (
"strings"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/metricsql"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/metricsql/binaryop"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/storage"
)
var binaryOpFuncs = map[string]binaryOpFunc{
"+": newBinaryOpArithFunc(binaryOpPlus),
"-": newBinaryOpArithFunc(binaryOpMinus),
"*": newBinaryOpArithFunc(binaryOpMul),
"/": newBinaryOpArithFunc(binaryOpDiv),
"%": newBinaryOpArithFunc(binaryOpMod),
"^": newBinaryOpArithFunc(binaryOpPow),
"+": newBinaryOpArithFunc(binaryop.Plus),
"-": newBinaryOpArithFunc(binaryop.Minus),
"*": newBinaryOpArithFunc(binaryop.Mul),
"/": newBinaryOpArithFunc(binaryop.Div),
"%": newBinaryOpArithFunc(binaryop.Mod),
"^": newBinaryOpArithFunc(binaryop.Pow),
// cmp ops
"==": newBinaryOpCmpFunc(binaryOpEq),
"!=": newBinaryOpCmpFunc(binaryOpNeq),
">": newBinaryOpCmpFunc(binaryOpGt),
"<": newBinaryOpCmpFunc(binaryOpLt),
">=": newBinaryOpCmpFunc(binaryOpGte),
"<=": newBinaryOpCmpFunc(binaryOpLte),
"==": newBinaryOpCmpFunc(binaryop.Eq),
"!=": newBinaryOpCmpFunc(binaryop.Neq),
">": newBinaryOpCmpFunc(binaryop.Gt),
"<": newBinaryOpCmpFunc(binaryop.Lt),
">=": newBinaryOpCmpFunc(binaryop.Gte),
"<=": newBinaryOpCmpFunc(binaryop.Lte),
// logical set ops
"and": binaryOpAnd,
"or": binaryOpOr,
"unless": binaryOpUnless,
// New ops
"if": newBinaryOpArithFunc(binaryOpIf),
"ifnot": newBinaryOpArithFunc(binaryOpIfnot),
"default": newBinaryOpArithFunc(binaryOpDefault),
}
var binaryOpPriorities = map[string]int{
"default": -1,
"if": 0,
"ifnot": 0,
// See https://prometheus.io/docs/prometheus/latest/querying/operators/#binary-operator-precedence
"or": 1,
"and": 2,
"unless": 2,
"==": 3,
"!=": 3,
"<": 3,
">": 3,
"<=": 3,
">=": 3,
"+": 4,
"-": 4,
"*": 5,
"/": 5,
"%": 5,
"^": 6,
// New ops
"if": newBinaryOpArithFunc(binaryop.If),
"ifnot": newBinaryOpArithFunc(binaryop.Ifnot),
"default": newBinaryOpArithFunc(binaryop.Default),
}
func getBinaryOpFunc(op string) binaryOpFunc {
@@ -70,144 +43,8 @@ func getBinaryOpFunc(op string) binaryOpFunc {
return binaryOpFuncs[op]
}
func isBinaryOp(op string) bool {
return getBinaryOpFunc(op) != nil
}
func binaryOpPriority(op string) int {
op = strings.ToLower(op)
return binaryOpPriorities[op]
}
func scanBinaryOpPrefix(s string) int {
n := 0
for op := range binaryOpFuncs {
if len(s) < len(op) {
continue
}
ss := strings.ToLower(s[:len(op)])
if ss == op && len(op) > n {
n = len(op)
}
}
return n
}
func isRightAssociativeBinaryOp(op string) bool {
// See https://prometheus.io/docs/prometheus/latest/querying/operators/#binary-operator-precedence
return op == "^"
}
func isBinaryOpGroupModifier(s string) bool {
s = strings.ToLower(s)
switch s {
// See https://prometheus.io/docs/prometheus/latest/querying/operators/#vector-matching
case "on", "ignoring":
return true
default:
return false
}
}
func isBinaryOpJoinModifier(s string) bool {
s = strings.ToLower(s)
switch s {
case "group_left", "group_right":
return true
default:
return false
}
}
func isBinaryOpBoolModifier(s string) bool {
s = strings.ToLower(s)
return s == "bool"
}
func isBinaryOpCmp(op string) bool {
switch op {
case "==", "!=", ">", "<", ">=", "<=":
return true
default:
return false
}
}
func isBinaryOpLogicalSet(op string) bool {
op = strings.ToLower(op)
switch op {
case "and", "or", "unless":
return true
default:
return false
}
}
func binaryOpConstants(op string, left, right float64, isBool bool) float64 {
if isBinaryOpCmp(op) {
evalCmp := func(cf func(left, right float64) bool) float64 {
if isBool {
if cf(left, right) {
return 1
}
return 0
}
if cf(left, right) {
return left
}
return nan
}
switch op {
case "==":
left = evalCmp(binaryOpEq)
case "!=":
left = evalCmp(binaryOpNeq)
case ">":
left = evalCmp(binaryOpGt)
case "<":
left = evalCmp(binaryOpLt)
case ">=":
left = evalCmp(binaryOpGte)
case "<=":
left = evalCmp(binaryOpLte)
default:
logger.Panicf("BUG: unexpected comparison binaryOp: %q", op)
}
} else {
switch op {
case "+":
left = binaryOpPlus(left, right)
case "-":
left = binaryOpMinus(left, right)
case "*":
left = binaryOpMul(left, right)
case "/":
left = binaryOpDiv(left, right)
case "%":
left = binaryOpMod(left, right)
case "^":
left = binaryOpPow(left, right)
case "and":
// Nothing to do
case "or":
// Nothing to do
case "unless":
left = nan
case "default":
left = binaryOpDefault(left, right)
case "if":
left = binaryOpIf(left, right)
case "ifnot":
left = binaryOpIfnot(left, right)
default:
logger.Panicf("BUG: unexpected non-comparison binaryOp: %q", op)
}
}
return left
}
type binaryOpFuncArg struct {
be *binaryOpExpr
be *metricsql.BinaryOpExpr
left []*timeseries
right []*timeseries
}
@@ -260,18 +97,21 @@ func newBinaryOpFunc(bf func(left, right float64, isBool bool) float64) binaryOp
dstValues[j] = bf(a, b, isBool)
}
}
// Optimization: remove time series containing only NaNs.
// This is quite common after applying filters like `q > 0`.
dst = removeNaNs(dst)
return dst, nil
}
}
func adjustBinaryOpTags(be *binaryOpExpr, left, right []*timeseries) ([]*timeseries, []*timeseries, []*timeseries, error) {
func adjustBinaryOpTags(be *metricsql.BinaryOpExpr, left, right []*timeseries) ([]*timeseries, []*timeseries, []*timeseries, error) {
if len(be.GroupModifier.Op) == 0 && len(be.JoinModifier.Op) == 0 {
if isScalar(left) {
// Fast path: `scalar op vector`
rvsLeft := make([]*timeseries, len(right))
tsLeft := left[0]
for i, tsRight := range right {
tsRight.MetricName.ResetMetricGroup()
resetMetricGroupIfRequired(be, tsRight)
rvsLeft[i] = tsLeft
}
return rvsLeft, right, right, nil
@@ -281,7 +121,7 @@ func adjustBinaryOpTags(be *binaryOpExpr, left, right []*timeseries) ([]*timeser
rvsRight := make([]*timeseries, len(left))
tsRight := right[0]
for i, tsLeft := range left {
tsLeft.MetricName.ResetMetricGroup()
resetMetricGroupIfRequired(be, tsLeft)
rvsRight[i] = tsRight
}
return left, rvsRight, left, nil
@@ -289,19 +129,14 @@ func adjustBinaryOpTags(be *binaryOpExpr, left, right []*timeseries) ([]*timeser
}
// Slow path: `vector op vector` or `a op {on|ignoring} {group_left|group_right} b`
ensureOneX := func(side string, tss []*timeseries) error {
if len(tss) == 0 {
logger.Panicf("BUG: tss must contain at least one value")
}
if len(tss) == 1 {
return nil
}
return fmt.Errorf(`duplicate timeseries on the %s side of %q: %s %s`, side, be.Op, stringMetricTags(&tss[0].MetricName), be.GroupModifier.AppendString(nil))
}
var rvsLeft, rvsRight []*timeseries
mLeft, mRight := createTimeseriesMapByTagSet(be, left, right)
joinOp := strings.ToLower(be.JoinModifier.Op)
joinTags := be.JoinModifier.Args
groupOp := strings.ToLower(be.GroupModifier.Op)
if len(groupOp) == 0 {
groupOp = "ignoring"
}
groupTags := be.GroupModifier.Args
for k, tssLeft := range mLeft {
tssRight := mRight[k]
if len(tssRight) == 0 {
@@ -309,37 +144,38 @@ func adjustBinaryOpTags(be *binaryOpExpr, left, right []*timeseries) ([]*timeser
}
switch joinOp {
case "group_left":
if err := ensureOneX("right", tssRight); err != nil {
var err error
rvsLeft, rvsRight, err = groupJoin("right", be, rvsLeft, rvsRight, tssLeft, tssRight)
if err != nil {
return nil, nil, nil, err
}
src := tssRight[0]
for _, ts := range tssLeft {
ts.MetricName.AddMissingTags(joinTags, &src.MetricName)
rvsLeft = append(rvsLeft, ts)
rvsRight = append(rvsRight, src)
}
case "group_right":
if err := ensureOneX("left", tssLeft); err != nil {
var err error
rvsRight, rvsLeft, err = groupJoin("left", be, rvsRight, rvsLeft, tssRight, tssLeft)
if err != nil {
return nil, nil, nil, err
}
src := tssLeft[0]
for _, ts := range tssRight {
ts.MetricName.AddMissingTags(joinTags, &src.MetricName)
rvsLeft = append(rvsLeft, src)
rvsRight = append(rvsRight, ts)
}
case "":
if err := ensureOneX("left", tssLeft); err != nil {
if err := ensureSingleTimeseries("left", be, tssLeft); err != nil {
return nil, nil, nil, err
}
if err := ensureOneX("right", tssRight); err != nil {
if err := ensureSingleTimeseries("right", be, tssRight); err != nil {
return nil, nil, nil, err
}
tssLeft[0].MetricName.ResetMetricGroup()
rvsLeft = append(rvsLeft, tssLeft[0])
tsLeft := tssLeft[0]
resetMetricGroupIfRequired(be, tsLeft)
switch groupOp {
case "on":
tsLeft.MetricName.RemoveTagsOn(groupTags)
case "ignoring":
tsLeft.MetricName.RemoveTagsIgnoring(groupTags)
default:
logger.Panicf("BUG: unexpected binary op modifier %q", groupOp)
}
rvsLeft = append(rvsLeft, tsLeft)
rvsRight = append(rvsRight, tssRight[0])
default:
return nil, nil, nil, fmt.Errorf(`unexpected join modifier %q`, joinOp)
logger.Panicf("BUG: unexpected join modifier %q", joinOp)
}
}
dst := rvsLeft
@@ -349,82 +185,121 @@ func adjustBinaryOpTags(be *binaryOpExpr, left, right []*timeseries) ([]*timeser
return rvsLeft, rvsRight, dst, nil
}
func binaryOpPlus(left, right float64) float64 {
return left + right
}
func binaryOpMinus(left, right float64) float64 {
return left - right
}
func binaryOpMul(left, right float64) float64 {
return left * right
}
func binaryOpDiv(left, right float64) float64 {
return left / right
}
func binaryOpMod(left, right float64) float64 {
return math.Mod(left, right)
}
func binaryOpPow(left, right float64) float64 {
return math.Pow(left, right)
}
func binaryOpDefault(left, right float64) float64 {
if math.IsNaN(left) {
return right
func ensureSingleTimeseries(side string, be *metricsql.BinaryOpExpr, tss []*timeseries) error {
if len(tss) == 0 {
logger.Panicf("BUG: tss must contain at least one value")
}
return left
}
func binaryOpIf(left, right float64) float64 {
if math.IsNaN(right) {
return nan
for len(tss) > 1 {
if !mergeNonOverlappingTimeseries(tss[0], tss[len(tss)-1]) {
return fmt.Errorf(`duplicate time series on the %s side of %s %s: %s and %s`, side, be.Op, be.GroupModifier.AppendString(nil),
stringMetricTags(&tss[0].MetricName), stringMetricTags(&tss[len(tss)-1].MetricName))
}
tss = tss[:len(tss)-1]
}
return left
return nil
}
func binaryOpIfnot(left, right float64) float64 {
if math.IsNaN(right) {
return left
func groupJoin(singleTimeseriesSide string, be *metricsql.BinaryOpExpr, rvsLeft, rvsRight, tssLeft, tssRight []*timeseries) ([]*timeseries, []*timeseries, error) {
joinTags := be.JoinModifier.Args
var m map[string]*timeseries
for _, tsLeft := range tssLeft {
resetMetricGroupIfRequired(be, tsLeft)
if len(tssRight) == 1 {
// Easy case - right part contains only a single matching time series.
tsLeft.MetricName.AddMissingTags(joinTags, &tssRight[0].MetricName)
rvsLeft = append(rvsLeft, tsLeft)
rvsRight = append(rvsRight, tssRight[0])
continue
}
// Hard case - right part contains multiple matching time series.
// Verify it doesn't result in duplicate MetricName values after adding missing tags.
if m == nil {
m = make(map[string]*timeseries, len(tssRight))
} else {
for k := range m {
delete(m, k)
}
}
bb := bbPool.Get()
for _, tsRight := range tssRight {
var tsCopy timeseries
tsCopy.CopyFromShallowTimestamps(tsLeft)
tsCopy.MetricName.AddMissingTags(joinTags, &tsRight.MetricName)
bb.B = marshalMetricTagsSorted(bb.B[:0], &tsCopy.MetricName)
if tsExisting := m[string(bb.B)]; tsExisting != nil {
// Try merging tsExisting with tsRight if they don't overlap.
if mergeNonOverlappingTimeseries(tsExisting, tsRight) {
continue
}
return nil, nil, fmt.Errorf("duplicate time series on the %s side of `%s %s %s`: %s and %s",
singleTimeseriesSide, be.Op, be.GroupModifier.AppendString(nil), be.JoinModifier.AppendString(nil),
stringMetricTags(&tsExisting.MetricName), stringMetricTags(&tsRight.MetricName))
}
m[string(bb.B)] = tsRight
rvsLeft = append(rvsLeft, &tsCopy)
rvsRight = append(rvsRight, tsRight)
}
bbPool.Put(bb)
}
return nan
return rvsLeft, rvsRight, nil
}
func binaryOpEq(left, right float64) bool {
return left == right
func mergeNonOverlappingTimeseries(dst, src *timeseries) bool {
// Verify whether the time series can be merged.
srcValues := src.Values
dstValues := dst.Values
_ = dstValues[len(srcValues)-1]
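// The line above hints to the compiler that dstValues is at least as long as srcValues,
// so bounds checks can be eliminated in the loops below; it also panics early on mismatched lengths.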
for i, v := range srcValues {
if math.IsNaN(v) {
continue
}
if !math.IsNaN(dstValues[i]) {
return false
}
}
// Time series can be merged. Merge them.
for i, v := range srcValues {
if math.IsNaN(v) {
continue
}
dstValues[i] = v
}
return true
}
func binaryOpNeq(left, right float64) bool {
return left != right
}
func binaryOpGt(left, right float64) bool {
return left > right
}
func binaryOpLt(left, right float64) bool {
return left < right
}
func binaryOpGte(left, right float64) bool {
return left >= right
}
func binaryOpLte(left, right float64) bool {
return left <= right
func resetMetricGroupIfRequired(be *metricsql.BinaryOpExpr, ts *timeseries) {
if metricsql.IsBinaryOpCmp(be.Op) && !be.Bool {
// Do not reset MetricGroup for non-boolean `compare` binary ops, in order to be consistent with Prometheus.
return
}
switch be.Op {
case "default", "if", "ifnot":
// Do not reset MetricGroup for these ops.
return
}
ts.MetricName.ResetMetricGroup()
}
func binaryOpAnd(bfa *binaryOpFuncArg) ([]*timeseries, error) {
mLeft, mRight := createTimeseriesMapByTagSet(bfa.be, bfa.left, bfa.right)
var rvs []*timeseries
for k := range mRight {
if tss := mLeft[k]; tss != nil {
rvs = append(rvs, tss...)
for k, tssRight := range mRight {
tssLeft := mLeft[k]
if tssLeft == nil {
continue
}
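// Keep a left-hand point only if at least one right-hand series has a non-NaN value
// at the same timestamp; otherwise the point becomes NaN and is dropped by removeNaNs below.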
for i := range tssLeft[0].Values {
if !isAllNaNs(tssRight, i) {
continue
}
for _, tsLeft := range tssLeft {
tsLeft.Values[i] = nan
}
}
tssLeft = removeNaNs(tssLeft)
rvs = append(rvs, tssLeft...)
}
return rvs, nil
}
@@ -446,15 +321,36 @@ func binaryOpOr(bfa *binaryOpFuncArg) ([]*timeseries, error) {
func binaryOpUnless(bfa *binaryOpFuncArg) ([]*timeseries, error) {
mLeft, mRight := createTimeseriesMapByTagSet(bfa.be, bfa.left, bfa.right)
var rvs []*timeseries
for k, tss := range mLeft {
if mRight[k] == nil {
rvs = append(rvs, tss...)
for k, tssLeft := range mLeft {
tssRight := mRight[k]
if tssRight == nil {
rvs = append(rvs, tssLeft...)
continue
}
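// Drop left-hand points at timestamps where at least one right-hand series has a non-NaN value,
// keeping only the points where the right side is entirely NaN.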
for i := range tssLeft[0].Values {
if isAllNaNs(tssRight, i) {
continue
}
for _, tsLeft := range tssLeft {
tsLeft.Values[i] = nan
}
}
tssLeft = removeNaNs(tssLeft)
rvs = append(rvs, tssLeft...)
}
return rvs, nil
}
func createTimeseriesMapByTagSet(be *binaryOpExpr, left, right []*timeseries) (map[string][]*timeseries, map[string][]*timeseries) {
func isAllNaNs(tss []*timeseries, idx int) bool {
for _, ts := range tss {
if !math.IsNaN(ts.Values[idx]) {
return false
}
}
return true
}
func createTimeseriesMapByTagSet(be *metricsql.BinaryOpExpr, left, right []*timeseries) (map[string][]*timeseries, map[string][]*timeseries) {
groupTags := be.GroupModifier.Args
groupOp := strings.ToLower(be.GroupModifier.Op)
if len(groupOp) == 0 {

View File

@@ -8,18 +8,20 @@ import (
"sync"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/netstorage"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/auth"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/memory"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/metricsql"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/storage"
"github.com/VictoriaMetrics/metrics"
)
var (
maxPointsPerTimeseries = flag.Int("search.maxPointsPerTimeseries", 10e3, "The maximum points per a single timeseries returned from the search")
maxPointsPerTimeseries = flag.Int("search.maxPointsPerTimeseries", 30e3, "The maximum points per a single timeseries returned from the search")
)
// The minumum number of points per timeseries for enabling time rounding.
// The minimum number of points per timeseries for enabling time rounding.
// This improves cache hit ratio for frequently requested queries over
// big time ranges.
const minTimeseriesPointsForTimeRounding = 50
@@ -31,7 +33,7 @@ const minTimeseriesPointsForTimeRounding = 50
func ValidateMaxPointsPerTimeseries(start, end, step int64) error {
points := (end-start)/step + 1
if uint64(points) > uint64(*maxPointsPerTimeseries) {
return fmt.Errorf(`too many points for the given step=%d, start=%d and end=%d: %d; cannot exceed %d points`,
return fmt.Errorf(`too many points for the given step=%d, start=%d and end=%d: %d; cannot exceed -search.maxPointsPerTimeseries=%d`,
step, start, end, uint64(points), *maxPointsPerTimeseries)
}
return nil
@@ -57,19 +59,33 @@ func AdjustStartEnd(start, end, step int64) (int64, int64) {
if adjust > 0 {
end += step - adjust
}
// Make sure that the new number of points is the same as the initial number of points.
newPoints := (end-start)/step + 1
for newPoints > points {
end -= step
newPoints--
}
return start, end
}
// EvalConfig is the configuration required for query evaluation via Exec
type EvalConfig struct {
Start int64
End int64
Step int64
AuthToken *auth.Token
Start int64
End int64
Step int64
Deadline netstorage.Deadline
MayCache bool
// LookbackDelta is analogous to `-query.lookback-delta` from Prometheus.
LookbackDelta int64
DenyPartialResponse bool
timestamps []int64
timestampsOnce sync.Once
}
@@ -77,11 +93,14 @@ type EvalConfig struct {
// newEvalConfig returns new EvalConfig copy from src.
func newEvalConfig(src *EvalConfig) *EvalConfig {
var ec EvalConfig
ec.AuthToken = src.AuthToken
ec.Start = src.Start
ec.End = src.End
ec.Step = src.Step
ec.Deadline = src.Deadline
ec.MayCache = src.MayCache
ec.LookbackDelta = src.LookbackDelta
ec.DenyPartialResponse = src.DenyPartialResponse
// do not copy src.timestamps - they must be generated again.
return &ec
@@ -140,25 +159,25 @@ func getTimestamps(start, end, step int64) []int64 {
return timestamps
}
func evalExpr(ec *EvalConfig, e expr) ([]*timeseries, error) {
if me, ok := e.(*metricExpr); ok {
re := &rollupExpr{
func evalExpr(ec *EvalConfig, e metricsql.Expr) ([]*timeseries, error) {
if me, ok := e.(*metricsql.MetricExpr); ok {
re := &metricsql.RollupExpr{
Expr: me,
}
rv, err := evalRollupFunc(ec, "default_rollup", rollupDefault, re)
rv, err := evalRollupFunc(ec, "default_rollup", rollupDefault, e, re, nil)
if err != nil {
return nil, fmt.Errorf(`cannot evaluate %q: %s`, me.AppendString(nil), err)
}
return rv, nil
}
if re, ok := e.(*rollupExpr); ok {
rv, err := evalRollupFunc(ec, "default_rollup", rollupDefault, re)
if re, ok := e.(*metricsql.RollupExpr); ok {
rv, err := evalRollupFunc(ec, "default_rollup", rollupDefault, e, re, nil)
if err != nil {
return nil, fmt.Errorf(`cannot evaluate %q: %s`, re.AppendString(nil), err)
}
return rv, nil
}
if fe, ok := e.(*funcExpr); ok {
if fe, ok := e.(*metricsql.FuncExpr); ok {
nrf := getRollupFunc(fe.Name)
if nrf == nil {
args, err := evalExprs(ec, fe.Args)
@@ -188,13 +207,30 @@ func evalExpr(ec *EvalConfig, e expr) ([]*timeseries, error) {
if err != nil {
return nil, err
}
rv, err := evalRollupFunc(ec, fe.Name, rf, re)
rv, err := evalRollupFunc(ec, fe.Name, rf, e, re, nil)
if err != nil {
return nil, fmt.Errorf(`cannot evaluate %q: %s`, fe.AppendString(nil), err)
}
return rv, nil
}
if ae, ok := e.(*aggrFuncExpr); ok {
if ae, ok := e.(*metricsql.AggrFuncExpr); ok {
if callbacks := getIncrementalAggrFuncCallbacks(ae.Name); callbacks != nil {
fe, nrf := tryGetArgRollupFuncWithMetricExpr(ae)
if fe != nil {
// There is an optimized path for calculating metricsql.AggrFuncExpr over rollupFunc over metricsql.MetricExpr.
// The optimized path saves RAM when aggregating over a big number of time series.
args, re, err := evalRollupFuncArgs(ec, fe)
if err != nil {
return nil, err
}
rf, err := nrf(args)
if err != nil {
return nil, err
}
iafc := newIncrementalAggrFuncContext(ae, callbacks)
return evalRollupFunc(ec, fe.Name, rf, e, re, iafc)
}
}
args, err := evalExprs(ec, ae.Args)
if err != nil {
return nil, err
@@ -214,7 +250,7 @@ func evalExpr(ec *EvalConfig, e expr) ([]*timeseries, error) {
}
return rv, nil
}
if be, ok := e.(*binaryOpExpr); ok {
if be, ok := e.(*metricsql.BinaryOpExpr); ok {
left, err := evalExpr(ec, be.Left)
if err != nil {
return nil, err
@@ -238,18 +274,87 @@ func evalExpr(ec *EvalConfig, e expr) ([]*timeseries, error) {
}
return rv, nil
}
if ne, ok := e.(*numberExpr); ok {
if ne, ok := e.(*metricsql.NumberExpr); ok {
rv := evalNumber(ec, ne.N)
return rv, nil
}
if se, ok := e.(*stringExpr); ok {
if se, ok := e.(*metricsql.StringExpr); ok {
rv := evalString(ec, se.S)
return rv, nil
}
return nil, fmt.Errorf("unexpected expression %q", e.AppendString(nil))
}
func evalExprs(ec *EvalConfig, es []expr) ([][]*timeseries, error) {
func tryGetArgRollupFuncWithMetricExpr(ae *metricsql.AggrFuncExpr) (*metricsql.FuncExpr, newRollupFunc) {
if len(ae.Args) != 1 {
return nil, nil
}
e := ae.Args[0]
// Make sure e contains one of the following:
// - metricExpr
// - metricExpr[d]
// - rollupFunc(metricExpr)
// - rollupFunc(metricExpr[d])
if me, ok := e.(*metricsql.MetricExpr); ok {
// e = metricExpr
if me.IsEmpty() {
return nil, nil
}
fe := &metricsql.FuncExpr{
Name: "default_rollup",
Args: []metricsql.Expr{me},
}
nrf := getRollupFunc(fe.Name)
return fe, nrf
}
if re, ok := e.(*metricsql.RollupExpr); ok {
if me, ok := re.Expr.(*metricsql.MetricExpr); !ok || me.IsEmpty() || re.ForSubquery() {
return nil, nil
}
// e = metricExpr[d]
fe := &metricsql.FuncExpr{
Name: "default_rollup",
Args: []metricsql.Expr{re},
}
nrf := getRollupFunc(fe.Name)
return fe, nrf
}
fe, ok := e.(*metricsql.FuncExpr)
if !ok {
return nil, nil
}
nrf := getRollupFunc(fe.Name)
if nrf == nil {
return nil, nil
}
rollupArgIdx := getRollupArgIdx(fe.Name)
if rollupArgIdx >= len(fe.Args) {
// Incorrect number of args for rollup func.
return nil, nil
}
arg := fe.Args[rollupArgIdx]
if me, ok := arg.(*metricsql.MetricExpr); ok {
if me.IsEmpty() {
return nil, nil
}
// e = rollupFunc(metricExpr)
return &metricsql.FuncExpr{
Name: fe.Name,
Args: []metricsql.Expr{me},
}, nrf
}
if re, ok := arg.(*metricsql.RollupExpr); ok {
if me, ok := re.Expr.(*metricsql.MetricExpr); !ok || me.IsEmpty() || re.ForSubquery() {
return nil, nil
}
// e = rollupFunc(metricExpr[d])
return fe, nrf
}
return nil, nil
}
func evalExprs(ec *EvalConfig, es []metricsql.Expr) ([][]*timeseries, error) {
var rvs [][]*timeseries
for _, e := range es {
rv, err := evalExpr(ec, e)
@@ -261,9 +366,12 @@ func evalExprs(ec *EvalConfig, es []expr) ([][]*timeseries, error) {
return rvs, nil
}
func evalRollupFuncArgs(ec *EvalConfig, fe *funcExpr) ([]interface{}, *rollupExpr, error) {
var re *rollupExpr
func evalRollupFuncArgs(ec *EvalConfig, fe *metricsql.FuncExpr) ([]interface{}, *metricsql.RollupExpr, error) {
var re *metricsql.RollupExpr
rollupArgIdx := getRollupArgIdx(fe.Name)
if len(fe.Args) <= rollupArgIdx {
return nil, nil, fmt.Errorf("expecting at least %d args to %q; got %d args; expr: %q", rollupArgIdx+1, fe.Name, len(fe.Args), fe.AppendString(nil))
}
args := make([]interface{}, len(fe.Args))
for i, arg := range fe.Args {
if i == rollupArgIdx {
@@ -280,65 +388,72 @@ func evalRollupFuncArgs(ec *EvalConfig, fe *funcExpr) ([]interface{}, *rollupExp
return args, re, nil
}
func getRollupExprArg(arg expr) *rollupExpr {
re, ok := arg.(*rollupExpr)
func getRollupExprArg(arg metricsql.Expr) *metricsql.RollupExpr {
re, ok := arg.(*metricsql.RollupExpr)
if !ok {
// Wrap non-rollup arg into rollupExpr.
return &rollupExpr{
// Wrap non-rollup arg into metricsql.RollupExpr.
return &metricsql.RollupExpr{
Expr: arg,
}
}
if len(re.Step) == 0 && !re.InheritStep {
// Return standard rollup if it doesn't set step.
if !re.ForSubquery() {
// Return standard rollup if it doesn't contain subquery.
return re
}
me, ok := re.Expr.(*metricExpr)
me, ok := re.Expr.(*metricsql.MetricExpr)
if !ok {
// arg contains subquery.
return re
}
// Convert me[w:step] -> default_rollup(me)[w:step]
reNew := *re
reNew.Expr = &funcExpr{
reNew.Expr = &metricsql.FuncExpr{
Name: "default_rollup",
Args: []expr{
&rollupExpr{Expr: me},
Args: []metricsql.Expr{
&metricsql.RollupExpr{Expr: me},
},
}
return &reNew
}
func evalRollupFunc(ec *EvalConfig, name string, rf rollupFunc, re *rollupExpr) ([]*timeseries, error) {
func evalRollupFunc(ec *EvalConfig, name string, rf rollupFunc, expr metricsql.Expr, re *metricsql.RollupExpr, iafc *incrementalAggrFuncContext) ([]*timeseries, error) {
ecNew := ec
var offset int64
if len(re.Offset) > 0 {
var err error
offset, err = DurationValue(re.Offset, ec.Step)
offset, err = metricsql.DurationValue(re.Offset, ec.Step)
if err != nil {
return nil, err
}
ecNew = newEvalConfig(ec)
ecNew = newEvalConfig(ecNew)
ecNew.Start -= offset
ecNew.End -= offset
ecNew.Start, ecNew.End = AdjustStartEnd(ecNew.Start, ecNew.End, ecNew.Step)
if ecNew.MayCache {
start, end := AdjustStartEnd(ecNew.Start, ecNew.End, ecNew.Step)
offset += ecNew.Start - start
ecNew.Start = start
ecNew.End = end
}
}
if name == "rollup_candlestick" {
// Automatically apply `offset -step` to `rollup_candlestick` function
// in order to obtain expected OHLC results.
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/309#issuecomment-582113462
step := ecNew.Step
ecNew = newEvalConfig(ecNew)
ecNew.Start += step
ecNew.End += step
offset -= step
}
var rvs []*timeseries
var err error
if me, ok := re.Expr.(*metricExpr); ok {
if me.IsEmpty() {
rvs = evalNumber(ecNew, nan)
} else {
var window int64
if len(re.Window) > 0 {
window, err = DurationValue(re.Window, ec.Step)
if err != nil {
return nil, err
}
}
rvs, err = evalRollupFuncWithMetricExpr(ecNew, name, rf, me, window)
}
if me, ok := re.Expr.(*metricsql.MetricExpr); ok {
rvs, err = evalRollupFuncWithMetricExpr(ecNew, name, rf, expr, me, iafc, re.Window)
} else {
rvs, err = evalRollupFuncWithSubquery(ecNew, name, rf, re)
if iafc != nil {
logger.Panicf("BUG: iafc must be nil for rollup %q over subquery %q", name, re.AppendString(nil))
}
rvs, err = evalRollupFuncWithSubquery(ecNew, name, rf, expr, re)
}
if err != nil {
return nil, err
@@ -357,12 +472,12 @@ func evalRollupFunc(ec *EvalConfig, name string, rf rollupFunc, re *rollupExpr)
return rvs, nil
}
func evalRollupFuncWithSubquery(ec *EvalConfig, name string, rf rollupFunc, re *rollupExpr) ([]*timeseries, error) {
// Do not use rollupResultCacheV here, since it works only with metricExpr.
func evalRollupFuncWithSubquery(ec *EvalConfig, name string, rf rollupFunc, expr metricsql.Expr, re *metricsql.RollupExpr) ([]*timeseries, error) {
// TODO: determine whether to use rollupResultCacheV here.
var step int64
if len(re.Step) > 0 {
var err error
step, err = DurationValue(re.Step, ec.Step)
step, err = metricsql.PositiveDurationValue(re.Step, ec.Step)
if err != nil {
return nil, err
}
@@ -372,15 +487,14 @@ func evalRollupFuncWithSubquery(ec *EvalConfig, name string, rf rollupFunc, re *
var window int64
if len(re.Window) > 0 {
var err error
window, err = DurationValue(re.Window, ec.Step)
window, err = metricsql.PositiveDurationValue(re.Window, ec.Step)
if err != nil {
return nil, err
}
}
ecSQ := newEvalConfig(ec)
ecSQ.Start -= window + maxSilenceInterval
ecSQ.End += step
ecSQ.Start -= window + maxSilenceInterval + step
ecSQ.Step = step
if err := ValidateMaxPointsPerTimeseries(ecSQ.Start, ecSQ.End, ecSQ.Step); err != nil {
return nil, err
@@ -390,35 +504,41 @@ func evalRollupFuncWithSubquery(ec *EvalConfig, name string, rf rollupFunc, re *
if err != nil {
return nil, err
}
if len(tssSQ) == 0 {
if name == "absent_over_time" {
tss := evalNumber(ec, 1)
return tss, nil
}
return nil, nil
}
sharedTimestamps := getTimestamps(ec.Start, ec.End, ec.Step)
preFunc, rcs := getRollupConfigs(name, rf, ec.Start, ec.End, ec.Step, window, sharedTimestamps)
preFunc, rcs, err := getRollupConfigs(name, rf, expr, ec.Start, ec.End, ec.Step, window, ec.LookbackDelta, sharedTimestamps)
if err != nil {
return nil, err
}
tss := make([]*timeseries, 0, len(tssSQ)*len(rcs))
var tssLock sync.Mutex
removeMetricGroup := !rollupFuncsKeepMetricGroup[name]
doParallel(tssSQ, func(tsSQ *timeseries, values []float64, timestamps []int64) ([]float64, []int64) {
values, timestamps = removeNanValues(values[:0], timestamps[:0], tsSQ.Values, tsSQ.Timestamps)
preFunc(values, timestamps)
for _, rc := range rcs {
var ts timeseries
ts.MetricName.CopyFrom(&tsSQ.MetricName)
if len(rc.TagValue) > 0 {
ts.MetricName.AddTag("rollup", rc.TagValue)
if tsm := newTimeseriesMap(name, sharedTimestamps, &tsSQ.MetricName); tsm != nil {
rc.DoTimeseriesMap(tsm, values, timestamps)
tssLock.Lock()
tss = tsm.AppendTimeseriesTo(tss)
tssLock.Unlock()
continue
}
ts.Values = rc.Do(ts.Values[:0], values, timestamps)
ts.Timestamps = sharedTimestamps
ts.denyReuse = true
var ts timeseries
doRollupForTimeseries(rc, &ts, &tsSQ.MetricName, values, timestamps, sharedTimestamps, removeMetricGroup)
tssLock.Lock()
tss = append(tss, &ts)
tssLock.Unlock()
}
return values, timestamps
})
if !rollupFuncsKeepMetricGroup[name] {
tss = copyTimeseriesMetricNames(tss)
for _, ts := range tss {
ts.MetricName.ResetMetricGroup()
}
}
return tss, nil
}
@@ -472,31 +592,28 @@ func removeNanValues(dstValues []float64, dstTimestamps []int64, values []float6
return dstValues, dstTimestamps
}
func getMaxPointsPerRollup() int {
maxPointsPerRollupOnce.Do(func() {
n := memory.Allowed() / 16 / 8
if n <= 16 {
n = 16
}
maxPointsPerRollup = n
})
return maxPointsPerRollup
}
var (
maxPointsPerRollup int
maxPointsPerRollupOnce sync.Once
)
var (
rollupResultCacheFullHits = metrics.NewCounter(`vm_rollup_result_cache_full_hits_total`)
rollupResultCachePartialHits = metrics.NewCounter(`vm_rollup_result_cache_partial_hits_total`)
rollupResultCacheMiss = metrics.NewCounter(`vm_rollup_result_cache_miss_total`)
)
func evalRollupFuncWithMetricExpr(ec *EvalConfig, name string, rf rollupFunc, me *metricExpr, window int64) ([]*timeseries, error) {
func evalRollupFuncWithMetricExpr(ec *EvalConfig, name string, rf rollupFunc,
expr metricsql.Expr, me *metricsql.MetricExpr, iafc *incrementalAggrFuncContext, windowStr string) ([]*timeseries, error) {
if me.IsEmpty() {
return evalNumber(ec, nan), nil
}
var window int64
if len(windowStr) > 0 {
var err error
window, err = metricsql.PositiveDurationValue(windowStr, ec.Step)
if err != nil {
return nil, err
}
}
// Search for partial results in cache.
tssCached, start := rollupResultCacheV.Get(name, ec, me, window)
tssCached, start := rollupResultCacheV.Get(ec, expr, window)
if start > ec.End {
// The result is fully cached.
rollupResultCacheFullHits.Inc()
@@ -508,53 +625,152 @@ func evalRollupFuncWithMetricExpr(ec *EvalConfig, name string, rf rollupFunc, me
rollupResultCacheMiss.Inc()
}
// Fetch the remaining part of the result.
sq := &storage.SearchQuery{
MinTimestamp: start - window - maxSilenceInterval,
MaxTimestamp: ec.End + ec.Step,
TagFilterss: [][]storage.TagFilter{me.TagFilters},
}
rss, err := netstorage.ProcessSearchQuery(sq, ec.Deadline)
// Obtain rollup configs before fetching data from db,
// so type errors can be caught earlier.
sharedTimestamps := getTimestamps(start, ec.End, ec.Step)
preFunc, rcs, err := getRollupConfigs(name, rf, expr, start, ec.End, ec.Step, window, ec.LookbackDelta, sharedTimestamps)
if err != nil {
return nil, err
}
// Fetch the remaining part of the result.
tfs := toTagFilters(me.LabelFilters)
minTimestamp := start - maxSilenceInterval
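// Extend the search range to the left by the lookbehind window (or by a single step),
// so the rollup for the first returned point has enough raw samples to work with.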
if window > ec.Step {
minTimestamp -= window
} else {
minTimestamp -= ec.Step
}
sq := &storage.SearchQuery{
AccountID: ec.AuthToken.AccountID,
ProjectID: ec.AuthToken.ProjectID,
MinTimestamp: minTimestamp,
MaxTimestamp: ec.End,
TagFilterss: [][]storage.TagFilter{tfs},
}
rss, isPartial, err := netstorage.ProcessSearchQuery(ec.AuthToken, sq, true, ec.Deadline)
if err != nil {
return nil, err
}
if isPartial && ec.DenyPartialResponse {
return nil, fmt.Errorf("cannot return full response, since some of vmstorage nodes are unavailable")
}
rssLen := rss.Len()
if rssLen == 0 {
rss.Cancel()
var tss []*timeseries
if name == "absent_over_time" {
tss = getAbsentTimeseries(ec, me)
}
// Add missing points until ec.End.
// Do not cache the result, since missing points
// may be backfilled in the future.
tss := mergeTimeseries(tssCached, nil, start, ec)
tss = mergeTimeseries(tssCached, tss, start, ec)
return tss, nil
}
sharedTimestamps := getTimestamps(start, ec.End, ec.Step)
preFunc, rcs := getRollupConfigs(name, rf, start, ec.End, ec.Step, window, sharedTimestamps)
// Verify timeseries fit available memory after the rollup.
// Take into account points from tssCached.
pointsPerTimeseries := 1 + (ec.End-ec.Start)/ec.Step
if uint64(pointsPerTimeseries) > uint64(getMaxPointsPerRollup()/rssLen/len(rcs)) {
rss.Cancel()
return nil, fmt.Errorf("cannot process more than %d data points for %d time series with %d points in each time series; "+
"possible solutions are: reducing the number of matching time series; switching to node with more RAM; increasing `step` query arg (%gs)",
getMaxPointsPerRollup(), rssLen*len(rcs), pointsPerTimeseries, float64(ec.Step)/1e3)
timeseriesLen := rssLen
if iafc != nil {
// Incremental aggregates require holding only GOMAXPROCS time series in memory.
timeseriesLen = runtime.GOMAXPROCS(-1)
if iafc.ae.Modifier.Op != "" {
// Increase the number of time series for a non-empty group list such as `aggr() by (something)`,
// since each group can have its own set of time series in memory.
// Assume the number of such groups is lower than 1000 :)
timeseriesLen *= 1000
}
}
rollupPoints := mulNoOverflow(pointsPerTimeseries, int64(timeseriesLen*len(rcs)))
rollupMemorySize := mulNoOverflow(rollupPoints, 16)
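// The 16-byte-per-point estimate roughly accounts for an 8-byte float64 value plus an 8-byte int64 timestamp.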
rml := getRollupMemoryLimiter()
if !rml.Get(uint64(rollupMemorySize)) {
rss.Cancel()
return nil, fmt.Errorf("not enough memory for processing %d data points across %d time series with %d points in each time series; "+
"possible solutions are: reducing the number of matching time series; switching to node with more RAM; "+
"increasing -memory.allowedPercent; increasing `step` query arg (%gs)",
rollupPoints, rssLen*len(rcs), pointsPerTimeseries, float64(ec.Step)/1e3)
}
defer rml.Put(uint64(rollupMemorySize))
// Evaluate rollup
tss := make([]*timeseries, 0, rssLen*len(rcs))
removeMetricGroup := !rollupFuncsKeepMetricGroup[name]
var tss []*timeseries
if iafc != nil {
tss, err = evalRollupWithIncrementalAggregate(name, iafc, rss, rcs, preFunc, sharedTimestamps, removeMetricGroup)
} else {
tss, err = evalRollupNoIncrementalAggregate(name, rss, rcs, preFunc, sharedTimestamps, removeMetricGroup)
}
if err != nil {
return nil, err
}
tss = mergeTimeseries(tssCached, tss, start, ec)
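// Cache the result only when it is complete: a partial response would otherwise keep being
// served from the cache even after the missing vmstorage nodes become available again.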
if !isPartial {
rollupResultCacheV.Put(ec, expr, window, tss)
}
return tss, nil
}
var (
rollupMemoryLimiter memoryLimiter
rollupMemoryLimiterOnce sync.Once
)
func getRollupMemoryLimiter() *memoryLimiter {
rollupMemoryLimiterOnce.Do(func() {
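// Allow up to 25% of the allowed memory for rollup calculations.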
rollupMemoryLimiter.MaxSize = uint64(memory.Allowed()) / 4
})
return &rollupMemoryLimiter
}
func evalRollupWithIncrementalAggregate(name string, iafc *incrementalAggrFuncContext, rss *netstorage.Results, rcs []*rollupConfig,
preFunc func(values []float64, timestamps []int64), sharedTimestamps []int64, removeMetricGroup bool) ([]*timeseries, error) {
err := rss.RunParallel(func(rs *netstorage.Result, workerID uint) {
preFunc(rs.Values, rs.Timestamps)
ts := getTimeseries()
defer putTimeseries(ts)
for _, rc := range rcs {
if tsm := newTimeseriesMap(name, sharedTimestamps, &rs.MetricName); tsm != nil {
rc.DoTimeseriesMap(tsm, rs.Values, rs.Timestamps)
for _, ts := range tsm.m {
iafc.updateTimeseries(ts, workerID)
}
continue
}
ts.Reset()
doRollupForTimeseries(rc, ts, &rs.MetricName, rs.Values, rs.Timestamps, sharedTimestamps, removeMetricGroup)
iafc.updateTimeseries(ts, workerID)
// ts.Timestamps points to sharedTimestamps. Zero it, so it can be re-used.
ts.Timestamps = nil
ts.denyReuse = false
}
})
if err != nil {
return nil, err
}
tss := iafc.finalizeTimeseries()
return tss, nil
}
func evalRollupNoIncrementalAggregate(name string, rss *netstorage.Results, rcs []*rollupConfig,
preFunc func(values []float64, timestamps []int64), sharedTimestamps []int64, removeMetricGroup bool) ([]*timeseries, error) {
tss := make([]*timeseries, 0, rss.Len()*len(rcs))
var tssLock sync.Mutex
err = rss.RunParallel(func(rs *netstorage.Result) {
err := rss.RunParallel(func(rs *netstorage.Result, workerID uint) {
preFunc(rs.Values, rs.Timestamps)
for _, rc := range rcs {
var ts timeseries
ts.MetricName.CopyFrom(&rs.MetricName)
if len(rc.TagValue) > 0 {
ts.MetricName.AddTag("rollup", rc.TagValue)
if tsm := newTimeseriesMap(name, sharedTimestamps, &rs.MetricName); tsm != nil {
rc.DoTimeseriesMap(tsm, rs.Values, rs.Timestamps)
tssLock.Lock()
tss = tsm.AppendTimeseriesTo(tss)
tssLock.Unlock()
continue
}
ts.Values = rc.Do(ts.Values[:0], rs.Values, rs.Timestamps)
ts.Timestamps = sharedTimestamps
ts.denyReuse = true
var ts timeseries
doRollupForTimeseries(rc, &ts, &rs.MetricName, rs.Values, rs.Timestamps, sharedTimestamps, removeMetricGroup)
tssLock.Lock()
tss = append(tss, &ts)
tssLock.Unlock()
@@ -563,64 +779,21 @@ func evalRollupFuncWithMetricExpr(ec *EvalConfig, name string, rf rollupFunc, me
if err != nil {
return nil, err
}
if !rollupFuncsKeepMetricGroup[name] {
tss = copyTimeseriesMetricNames(tss)
for _, ts := range tss {
ts.MetricName.ResetMetricGroup()
}
}
tss = mergeTimeseries(tssCached, tss, start, ec)
rollupResultCacheV.Put(name, ec, me, window, tss)
return tss, nil
}
func getRollupConfigs(name string, rf rollupFunc, start, end, step, window int64, sharedTimestamps []int64) (func(values []float64, timestamps []int64), []*rollupConfig) {
preFunc := func(values []float64, timestamps []int64) {}
if rollupFuncsRemoveCounterResets[name] {
preFunc = func(values []float64, timestamps []int64) {
removeCounterResets(values)
}
func doRollupForTimeseries(rc *rollupConfig, tsDst *timeseries, mnSrc *storage.MetricName, valuesSrc []float64, timestampsSrc []int64,
sharedTimestamps []int64, removeMetricGroup bool) {
tsDst.MetricName.CopyFrom(mnSrc)
if len(rc.TagValue) > 0 {
tsDst.MetricName.AddTag("rollup", rc.TagValue)
}
newRollupConfig := func(rf rollupFunc, tagValue string) *rollupConfig {
return &rollupConfig{
TagValue: tagValue,
Func: rf,
Start: start,
End: end,
Step: step,
Window: window,
Timestamps: sharedTimestamps,
}
if removeMetricGroup {
tsDst.MetricName.ResetMetricGroup()
}
appendRollupConfigs := func(dst []*rollupConfig) []*rollupConfig {
dst = append(dst, newRollupConfig(rollupMin, "min"))
dst = append(dst, newRollupConfig(rollupMax, "max"))
dst = append(dst, newRollupConfig(rollupAvg, "avg"))
return dst
}
var rcs []*rollupConfig
switch name {
case "rollup":
rcs = appendRollupConfigs(rcs)
case "rollup_rate", "rollup_deriv":
preFuncPrev := preFunc
preFunc = func(values []float64, timestamps []int64) {
preFuncPrev(values, timestamps)
derivValues(values, timestamps)
}
rcs = appendRollupConfigs(rcs)
case "rollup_increase", "rollup_delta":
preFuncPrev := preFunc
preFunc = func(values []float64, timestamps []int64) {
preFuncPrev(values, timestamps)
deltaValues(values)
}
rcs = appendRollupConfigs(rcs)
default:
rcs = append(rcs, newRollupConfig(rf, ""))
}
return preFunc, rcs
tsDst.Values = rc.Do(tsDst.Values[:0], valuesSrc, timestampsSrc)
tsDst.Timestamps = sharedTimestamps
tsDst.denyReuse = true
}
var bbPool bytesutil.ByteBufferPool
@@ -628,6 +801,8 @@ var bbPool bytesutil.ByteBufferPool
func evalNumber(ec *EvalConfig, n float64) []*timeseries {
var ts timeseries
ts.denyReuse = true
ts.MetricName.AccountID = ec.AuthToken.AccountID
ts.MetricName.ProjectID = ec.AuthToken.ProjectID
timestamps := ec.getSharedTimestamps()
values := make([]float64, len(timestamps))
for i := range timestamps {
@@ -653,3 +828,31 @@ func evalTime(ec *EvalConfig) []*timeseries {
}
return rv
}
func mulNoOverflow(a, b int64) int64 {
if math.MaxInt64/b < a {
// Overflow
return math.MaxInt64
}
return a * b
}
func toTagFilters(lfs []metricsql.LabelFilter) []storage.TagFilter {
tfs := make([]storage.TagFilter, len(lfs))
for i := range lfs {
toTagFilter(&tfs[i], &lfs[i])
}
return tfs
}
func toTagFilter(dst *storage.TagFilter, src *metricsql.LabelFilter) {
if src.Label != "__name__" {
dst.Key = []byte(src.Label)
} else {
// This is required for storage.Search.
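// storage.Search expects an empty Key for a filter on the metric name (MetricGroup) rather than on a regular label.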
dst.Key = nil
}
dst.Value = []byte(src.Value)
dst.IsRegexp = src.IsRegexp
dst.IsNegative = src.IsNegative
}

View File

@@ -1,29 +1,38 @@
package promql
import (
"flag"
"fmt"
"math"
"sort"
"sync"
"sync/atomic"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/netstorage"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/metricsql"
"github.com/VictoriaMetrics/metrics"
)
// ExpandWithExprs expands WITH expressions inside q and returns the resulting
// PromQL without WITH expressions.
func ExpandWithExprs(q string) (string, error) {
e, err := parsePromQLWithCache(q)
if err != nil {
return "", err
}
buf := e.AppendString(nil)
return string(buf), nil
}
var logSlowQueryDuration = flag.Duration("search.logSlowQueryDuration", 5*time.Second, "Log queries with execution time exceeding this value. Zero disables slow query logging")
var slowQueries = metrics.NewCounter(`vm_slow_queries_total`)
// Exec executes q for the given ec.
func Exec(ec *EvalConfig, q string, isFirstPointOnly bool) ([]netstorage.Result, error) {
if *logSlowQueryDuration > 0 {
startTime := time.Now()
defer func() {
d := time.Since(startTime)
if d >= *logSlowQueryDuration {
logger.Infof("slow query according to -search.logSlowQueryDuration=%s: duration=%.3f seconds, start=%d, end=%d, step=%d, accountID=%d, projectID=%d, query=%q",
*logSlowQueryDuration, d.Seconds(), ec.Start/1000, ec.End/1000, ec.Step/1000, ec.AuthToken.AccountID, ec.AuthToken.ProjectID, q)
slowQueries.Inc()
}
}()
}
// Exec executes q for the given ec until the deadline.
func Exec(ec *EvalConfig, q string) ([]netstorage.Result, error) {
ec.validate()
e, err := parsePromQLWithCache(q)
@@ -31,24 +40,18 @@ func Exec(ec *EvalConfig, q string) ([]netstorage.Result, error) {
return nil, err
}
// Add an additional point to the end. This point is used
// in calculating the last value for rate, deriv, increase
// and delta funcs.
ec.End += ec.Step
rv, err := evalExpr(ec, e)
if err != nil {
return nil, err
}
// Remove the additional point at the end.
for _, ts := range rv {
ts.Values = ts.Values[:len(ts.Values)-1]
// ts.Timestamps may be shared between timeseries, so truncate it with len(ts.Values) instead of len(ts.Timestamps)-1
ts.Timestamps = ts.Timestamps[:len(ts.Values)]
if isFirstPointOnly {
// Remove all the points except the first one from every time series.
for _, ts := range rv {
ts.Values = ts.Values[:1]
ts.Timestamps = ts.Timestamps[:1]
}
}
ec.End -= ec.Step
maySort := maySortResults(e, rv)
result, err := timeseriesToResult(rv, maySort)
@@ -58,17 +61,18 @@ func Exec(ec *EvalConfig, q string) ([]netstorage.Result, error) {
return result, err
}
func maySortResults(e expr, tss []*timeseries) bool {
func maySortResults(e metricsql.Expr, tss []*timeseries) bool {
if len(tss) > 100 {
// There is no sense in sorting a lot of results
return false
}
fe, ok := e.(*funcExpr)
fe, ok := e.(*metricsql.FuncExpr)
if !ok {
return true
}
switch fe.Name {
case "sort", "sort_desc":
case "sort", "sort_desc",
"sort_by_label", "sort_by_label_desc":
return false
default:
return true
@@ -78,14 +82,14 @@ func maySortResults(e expr, tss []*timeseries) bool {
func timeseriesToResult(tss []*timeseries, maySort bool) ([]netstorage.Result, error) {
tss = removeNaNs(tss)
result := make([]netstorage.Result, len(tss))
m := make(map[string]bool)
m := make(map[string]struct{}, len(tss))
bb := bbPool.Get()
for i, ts := range tss {
bb.B = marshalMetricNameSorted(bb.B[:0], &ts.MetricName)
if m[string(bb.B)] {
return nil, fmt.Errorf(`duplicate output timeseries: %s%s`, ts.MetricName.MetricGroup, stringMetricName(&ts.MetricName))
if _, ok := m[string(bb.B)]; ok {
return nil, fmt.Errorf(`duplicate output timeseries: %s`, stringMetricName(&ts.MetricName))
}
m[string(bb.B)] = true
m[string(bb.B)] = struct{}{}
rs := &result[i]
rs.MetricNameMarshaled = append(rs.MetricNameMarshaled[:0], bb.B...)
@@ -107,25 +111,30 @@ func timeseriesToResult(tss []*timeseries, maySort bool) ([]netstorage.Result, e
func removeNaNs(tss []*timeseries) []*timeseries {
rvs := tss[:0]
for _, ts := range tss {
nans := 0
allNans := true
for _, v := range ts.Values {
if math.IsNaN(v) {
nans++
if !math.IsNaN(v) {
allNans = false
break
}
}
if nans == len(ts.Values) {
if allNans {
// Skip timeseries with all NaNs.
continue
}
rvs = append(rvs, ts)
}
for i := len(rvs); i < len(tss); i++ {
// Zero unused time series, so GC can reclaim them.
tss[i] = nil
}
return rvs
}
func parsePromQLWithCache(q string) (expr, error) {
func parsePromQLWithCache(q string) (metricsql.Expr, error) {
pcv := parseCacheV.Get(q)
if pcv == nil {
e, err := parsePromQL(q)
e, err := metricsql.Parse(q)
pcv = &parseCacheValue{
e: e,
err: err,
@@ -157,16 +166,19 @@ var parseCacheV = func() *parseCache {
const parseCacheMaxLen = 10e3
type parseCacheValue struct {
e expr
e metricsql.Expr
err error
}
type parseCache struct {
m map[string]*parseCacheValue
mu sync.RWMutex
// Move atomic counters to the top of struct for 8-byte alignment on 32-bit arch.
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/212
requests uint64
misses uint64
m map[string]*parseCacheValue
mu sync.RWMutex
}
func (pc *parseCache) Requests() uint64 {

File diff suppressed because it is too large

View File

@@ -0,0 +1,33 @@
package promql
import (
"sync"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
)
type memoryLimiter struct {
MaxSize uint64
mu sync.Mutex
usage uint64
}
func (ml *memoryLimiter) Get(n uint64) bool {
ml.mu.Lock()
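// The check is written as MaxSize-n >= usage instead of usage+n <= MaxSize
// in order to avoid unsigned integer overflow for very large n.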
ok := n <= ml.MaxSize && ml.MaxSize-n >= ml.usage
if ok {
ml.usage += n
}
ml.mu.Unlock()
return ok
}
func (ml *memoryLimiter) Put(n uint64) {
ml.mu.Lock()
if n > ml.usage {
logger.Panicf("BUG: n=%d cannot exceed %d", n, ml.usage)
}
ml.usage -= n
ml.mu.Unlock()
}
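Below is a minimal usage sketch (not part of the diff) showing how a caller is expected to pair memoryLimiter.Get with memoryLimiter.Put, mirroring the reserve/release pattern used in evalRollupFuncWithMetricExpr above; the doRollupWithLimit function and its size argument are illustrative assumptions.
package promql
import "fmt"
// doRollupWithLimit reserves rollupMemorySize bytes from ml before doing work
// and releases the reservation when done.
func doRollupWithLimit(ml *memoryLimiter, rollupMemorySize uint64) error {
    if !ml.Get(rollupMemorySize) {
        return fmt.Errorf("not enough memory for processing %d bytes", rollupMemorySize)
    }
    defer ml.Put(rollupMemorySize)
    // ... perform the rollup while the memory is reserved ...
    return nil
}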

View File

@@ -0,0 +1,56 @@
package promql
import (
"testing"
)
func TestMemoryLimiter(t *testing.T) {
var ml memoryLimiter
ml.MaxSize = 100
// Allocate memory
if !ml.Get(10) {
t.Fatalf("cannot get 10 out of %d bytes", ml.MaxSize)
}
if ml.usage != 10 {
t.Fatalf("unexpected usage; got %d; want %d", ml.usage, 10)
}
if !ml.Get(20) {
t.Fatalf("cannot get 20 out of 90 bytes")
}
if ml.usage != 30 {
t.Fatalf("unexpected usage; got %d; want %d", ml.usage, 30)
}
if ml.Get(1000) {
t.Fatalf("unexpected get for 1000 bytes")
}
if ml.usage != 30 {
t.Fatalf("unexpected usage; got %d; want %d", ml.usage, 30)
}
if ml.Get(71) {
t.Fatalf("unexpected get for 71 bytes")
}
if ml.usage != 30 {
t.Fatalf("unexpected usage; got %d; want %d", ml.usage, 30)
}
if !ml.Get(70) {
t.Fatalf("cannot get 70 bytes")
}
if ml.usage != 100 {
t.Fatalf("unexpected usage; got %d; want %d", ml.usage, 100)
}
// Return memory back
ml.Put(10)
ml.Put(70)
if ml.usage != 20 {
t.Fatalf("unexpected usage; got %d; want %d", ml.usage, 20)
}
if !ml.Get(30) {
t.Fatalf("cannot get 30 bytes")
}
ml.Put(50)
if ml.usage != 0 {
t.Fatalf("unexpected usage; got %d; want %d", ml.usage, 0)
}
}

File diff suppressed because it is too large

View File

@@ -47,631 +47,3 @@ func TestParseMetricSelectorError(t *testing.T) {
f(`foo[5m]`)
f(`foo offset 5m`)
}
func TestParsePromQLSuccess(t *testing.T) {
another := func(s string, sExpected string) {
t.Helper()
e, err := parsePromQL(s)
if err != nil {
t.Fatalf("unexpected error when parsing %q: %s", s, err)
}
res := e.AppendString(nil)
if string(res) != sExpected {
t.Fatalf("unexpected string constructed;\ngot\n%q\nwant\n%q", res, sExpected)
}
}
same := func(s string) {
t.Helper()
another(s, s)
}
// metricExpr
same(`{}`)
same(`{}[5m]`)
same(`{}[5m:]`)
same(`{}[:]`)
another(`{}[: ]`, `{}[:]`)
same(`{}[:3s]`)
another(`{}[: 3s ]`, `{}[:3s]`)
same(`{}[5m:3s]`)
another(`{}[ 5m : 3s ]`, `{}[5m:3s]`)
same(`{} offset 5m`)
same(`{}[5m] offset 10y`)
same(`{}[5.3m:3.4s] offset 10y`)
same(`{}[:3.4s] offset 10y`)
same(`{Foo="bAR"}`)
same(`{foo="bar"}`)
same(`{foo="bar"}[5m]`)
same(`{foo="bar"}[5m:]`)
same(`{foo="bar"}[5m:3s]`)
same(`{foo="bar"} offset 10y`)
same(`{foo="bar"}[5m] offset 10y`)
same(`{foo="bar"}[5m:3s] offset 10y`)
another(`{foo="bar"}[5m] oFFSEt 10y`, `{foo="bar"}[5m] offset 10y`)
same("METRIC")
same("metric")
same("m_e:tri44:_c123")
another("-metric", "0 - metric")
same(`metric offset 10h`)
same("metric[5m]")
same("metric[5m:3s]")
same("metric[5m] offset 10h")
same("metric[5m:3s] offset 10h")
same("metric[5i:3i] offset 10i")
same(`metric{foo="bar"}`)
same(`metric{foo="bar"} offset 10h`)
same(`metric{foo!="bar"}[2d]`)
same(`metric{foo="bar"}[2d] offset 10h`)
same(`metric{foo="bar", b="sdfsdf"}[2d:3h] offset 10h`)
another(` metric { foo = "bar" } [ 2d ] offset 10h `, `metric{foo="bar"}[2d] offset 10h`)
// metric name matching keywords
same("rate")
same("RATE")
same("by")
same("BY")
same("bool")
same("BOOL")
same("unless")
same("UNLESS")
same("Ignoring")
same("with")
same("WITH")
same("With")
// Metric filters ending with comma
another(`m{foo="bar",}`, `m{foo="bar"}`)
// String concat in tag value
another(`m{foo="bar" + "baz"}`, `m{foo="barbaz"}`)
// Valid regexp
same(`foo{bar=~"x"}`)
same(`foo{bar=~"^x"}`)
same(`foo{bar=~"^x$"}`)
same(`foo{bar=~"^(a[bc]|d)$"}`)
same(`foo{bar!~"x"}`)
same(`foo{bar!~"^x"}`)
same(`foo{bar!~"^x$"}`)
same(`foo{bar!~"^(a[bc]|d)$"}`)
// stringExpr
same(`""`)
same(`"\n\t\r 12:{}[]()44"`)
another(`''`, `""`)
another("``", `""`)
another(" `foo\"b'ar` ", "\"foo\\\"b'ar\"")
another(` 'foo\'bar"BAZ' `, `"foo'bar\"BAZ"`)
// string concat
another(`"foo"+'bar'`, `"foobar"`)
// numberExpr
same(`1`)
same(`1.23`)
same(`0.23`)
same(`1.2e+45`)
same(`1.2e-45`)
same(`-1`)
same(`-1.23`)
same(`-0.23`)
same(`-1.2e+45`)
same(`-1.2e-45`)
same(`-1.2e-45`)
another(`12.5E34`, `1.25e+35`)
another(`-.2`, `-0.2`)
another(`-.2E-2`, `-0.002`)
same(`NaN`)
another(`Inf`, `+Inf`)
another(`+Inf`, `+Inf`)
another(`-Inf`, `-Inf`)
// binaryOpExpr
another(`NaN + 2 *3 * Inf`, `NaN`)
another(`Inf - Inf`, `NaN`)
another(`Inf + Inf`, `+Inf`)
another(`-m`, `0 - m`)
same(`m + ignoring () n[5m]`)
another(`M + IGNORING () N[5m]`, `M + ignoring () N[5m]`)
same(`m + on (foo) n[5m]`)
another(`m + ON (Foo) n[5m]`, `m + on (Foo) n[5m]`)
same(`m + ignoring (a, b) n[5m]`)
another(`1 or 2`, `1`)
another(`1 and 2`, `1`)
another(`1 unless 2`, `NaN`)
another(`1 default 2`, `1`)
another(`1 default NaN`, `1`)
another(`NaN default 2`, `2`)
another(`1 > 2`, `NaN`)
another(`1 > bool 2`, `0`)
another(`3 >= 2`, `3`)
another(`3 <= bool 2`, `0`)
another(`1 + -2 - 3`, `-4`)
another(`1 / 0 + 2`, `+Inf`)
another(`2 + -1 / 0`, `-Inf`)
another(`-1 ^ 0.5`, `NaN`)
another(`512.5 - (1 + 3) * (2 ^ 2) ^ 3`, `256.5`)
another(`1 == bool 1 != bool 24 < bool 4 > bool -1`, `1`)
another(`1 == bOOl 1 != BOOL 24 < Bool 4 > booL -1`, `1`)
another(`m1+on(foo)group_left m2`, `m1 + on (foo) group_left () m2`)
another(`M1+ON(FOO)GROUP_left M2`, `M1 + on (FOO) group_left () M2`)
same(`m1 + on (foo) group_right () m2`)
same(`m1 + on (foo, bar) group_right (x, y) m2`)
another(`m1 + on (foo, bar,) group_right (x, y,) m2`, `m1 + on (foo, bar) group_right (x, y) m2`)
same(`m1 == bool on (foo, bar) group_right (x, y) m2`)
another(`5 - 1 + 3 * 2 ^ 2 ^ 3 - 2 OR Metric {Bar= "Baz", aaa!="bb",cc=~"dd" ,zz !~"ff" } `,
`770 or Metric{Bar="Baz", aaa!="bb", cc=~"dd", zz!~"ff"}`)
same(`"foo" + bar()`)
same(`"foo" + bar{x="y"}`)
same(`("foo"[3s] + bar{x="y"})[5m:3s] offset 10s`)
same(`("foo"[3s] + bar{x="y"})[5i:3i] offset 10i`)
same(`bar + "foo" offset 3s`)
same(`bar + "foo" offset 3i`)
another(`1+2 if 2>3`, `NaN`)
another(`1+4 if 2<3`, `5`)
another(`2+6 default 3 if 2>3`, `8`)
another(`2+6 if 2>3 default NaN`, `NaN`)
another(`42 if 3>2 if 2+2<5`, `42`)
another(`42 if 3>2 if 2+2>=5`, `NaN`)
another(`1+2 ifnot 2>3`, `3`)
another(`1+4 ifnot 2<3`, `NaN`)
another(`2+6 default 3 ifnot 2>3`, `8`)
another(`2+6 ifnot 2>3 default NaN`, `8`)
another(`42 if 3>2 ifnot 2+2<5`, `NaN`)
another(`42 if 3>2 ifnot 2+2>=5`, `42`)
// parensExpr
another(`(-foo + ((bar) / (baz))) + ((23))`, `((0 - foo) + (bar / baz)) + 23`)
another(`(FOO + ((Bar) / (baZ))) + ((23))`, `(FOO + (Bar / baZ)) + 23`)
same(`(foo, bar)`)
another(`1+(foo, bar,)`, `1 + (foo, bar)`)
another(`((foo(bar,baz)), (1+(2)+(3,4)+()))`, `(foo(bar, baz), (3 + (3, 4)) + ())`)
same(`()`)
// funcExpr
same(`f()`)
another(`f(x,)`, `f(x)`)
another(`-f()-Ff()`, `(0 - f()) - Ff()`)
same(`F()`)
another(`+F()`, `F()`)
another(`++F()`, `F()`)
another(`--F()`, `0 - (0 - F())`)
same(`f(http_server_request)`)
same(`f(http_server_request)[4s:5m] offset 10m`)
same(`f(http_server_request)[4i:5i] offset 10i`)
same(`F(HttpServerRequest)`)
same(`f(job, foo)`)
same(`F(Job, Foo)`)
another(` FOO (bar) + f ( m ( ),ff(1 + ( 2.5)) ,M[5m ] , "ff" )`, `FOO(bar) + f(m(), ff(3.5), M[5m], "ff")`)
// funcName matching keywords
same(`by(2)`)
same(`BY(2)`)
same(`or(2)`)
same(`OR(2)`)
same(`bool(2)`)
same(`BOOL(2)`)
same(`rate(rate(m))`)
same(`rate(rate(m[5m]))`)
same(`rate(rate(m[5m])[1h:])`)
same(`rate(rate(m[5m])[1h:3s])`)
// aggrFuncExpr
same(`sum(http_server_request) by ()`)
same(`sum(http_server_request) by (job)`)
same(`sum(http_server_request) without (job, foo)`)
another(`sum(x,y,) without (a,b,)`, `sum(x, y) without (a, b)`)
another(`sum by () (xx)`, `sum(xx) by ()`)
another(`sum by (s) (xx)[5s]`, `(sum(xx) by (s))[5s]`)
another(`SUM BY (ZZ, aa) (XX)`, `sum(XX) by (ZZ, aa)`)
another(`sum without (a, b) (xx,2+2)`, `sum(xx, 4) without (a, b)`)
another(`Sum WIthout (a, B) (XX,2+2)`, `sum(XX, 4) without (a, B)`)
same(`sum(a) or sum(b)`)
same(`sum(a) by () or sum(b) without (x, y)`)
same(`sum(a) + sum(b)`)
same(`sum(x) * (1 + sum(a))`)
// All the above
another(`Sum(Ff(M) * M{X=""}[5m] Offset 7m - 123, 35) BY (X, y) * F2("Test")`,
`sum((Ff(M) * M{X=""}[5m] offset 7m) - 123, 35) by (X, y) * F2("Test")`)
another(`# comment
Sum(Ff(M) * M{X=""}[5m] Offset 7m - 123, 35) BY (X, y) # yet another comment
* F2("Test")`,
`sum((Ff(M) * M{X=""}[5m] offset 7m) - 123, 35) by (X, y) * F2("Test")`)
// withExpr
another(`with () x`, `x`)
another(`with (x=1,) x`, `1`)
another(`with (x = m offset 5h) x + x`, `m offset 5h + m offset 5h`)
another(`with (x = m offset 5i) x + x`, `m offset 5i + m offset 5i`)
another(`with (foo = bar{x="x"}) 1`, `1`)
another(`with (foo = bar{x="x"}) "x"`, `"x"`)
another(`with (f="x") f`, `"x"`)
another(`with (foo = bar{x="x"}) x{x="y"}`, `x{x="y"}`)
another(`with (foo = bar{x="x"}) 1+1`, `2`)
another(`with (foo = bar{x="x"}) f()`, `f()`)
another(`with (foo = bar{x="x"}) sum(x)`, `sum(x)`)
another(`with (foo = bar{x="x"}) baz{foo="bar"}`, `baz{foo="bar"}`)
another(`with (foo = bar) baz`, `baz`)
another(`with (foo = bar) foo + foo{a="b"}`, `bar + bar{a="b"}`)
another(`with (foo = bar, bar=baz + f()) test`, `test`)
another(`with (ct={job="test"}) a{ct} + ct() + f({ct="x"})`, `(a{job="test"} + {job="test"}) + f({ct="x"})`)
another(`with (ct={job="test", i="bar"}) ct + {ct, x="d"} + foo{ct, ct} + ctx(1)`,
`(({job="test", i="bar"} + {job="test", i="bar", x="d"}) + foo{job="test", i="bar"}) + ctx(1)`)
another(`with (foo = bar) {__name__=~"foo"}`, `{__name__=~"foo"}`)
another(`with (foo = bar) {__name__="foo"}`, `bar`)
another(`with (foo = bar) {__name__="foo", x="y"}`, `bar{x="y"}`)
another(`with (foo(bar) = {__name__!="bar"}) foo(x)`, `{__name__!="bar"}`)
another(`with (foo(bar) = {__name__="bar"}) foo(x)`, `x`)
// override ttf to something new.
another(`with (ttf = a) ttf + b`, `a + b`)
// override ttf to ru
another(`with (ttf = ru(m, n)) ttf`, `(clamp_min(n - clamp_min(m, 0), 0) / clamp_min(n, 0)) * 100`)
// Verify withExpr recursion and forward reference
another(`with (x = x+y, y = x+x) y ^ 2`, `((x + y) + (x + y)) ^ 2`)
another(`with (f1(x)=f2(x), f2(x)=f1(x)^2) f1(foobar)`, `f2(foobar)`)
another(`with (f1(x)=f2(x), f2(x)=f1(x)^2) f2(foobar)`, `f2(foobar) ^ 2`)
// Verify withExpr funcs
another(`with (x() = y+1) x`, `y + 1`)
another(`with (x(foo) = foo+1) x(a)`, `a + 1`)
another(`with (x(a, b) = a + b) x(foo, bar)`, `foo + bar`)
another(`with (x(a, b) = a + b) x(foo, x(1, 2))`, `foo + 3`)
another(`with (x(a) = sum(a) by (b)) x(xx) / x(y)`, `sum(xx) by (b) / sum(y) by (b)`)
another(`with (f(a,f,x)=ff(x,f,a)) f(f(x,y,z),1,2)`, `ff(2, 1, ff(z, y, x))`)
another(`with (f(x)=1+f(x)) f(foo{bar="baz"})`, `1 + f(foo{bar="baz"})`)
another(`with (a=foo, y=bar, f(a)= a+a+y) f(x)`, `(x + x) + bar`)
another(`with (f(a, b) = m{a, b}) f({a="x", b="y"}, {c="d"})`, `m{a="x", b="y", c="d"}`)
another(`with (xx={a="x"}, f(a, b) = m{a, b}) f({xx, b="y"}, {c="d"})`, `m{a="x", b="y", c="d"}`)
another(`with (x() = {b="c"}) foo{x}`, `foo{b="c"}`)
another(`with (f(x)=x{foo="bar"} offset 5m) f(m offset 10m)`, `(m{foo="bar"} offset 10m) offset 5m`)
another(`with (f(x)=x{foo="bar",bas="a"}[5m]) f(m[10m] offset 3s)`, `(m{foo="bar", bas="a"}[10m] offset 3s)[5m]`)
another(`with (f(x)=x{foo="bar"}[5m] offset 10m) f(m{x="y"})`, `m{x="y", foo="bar"}[5m] offset 10m`)
another(`with (f(x)=x{foo="bar"}[5m] offset 10m) f({x="y", foo="bar", foo="bar"})`, `{x="y", foo="bar"}[5m] offset 10m`)
another(`with (f(m, x)=m{x}[5m] offset 10m) f(foo, {})`, `foo[5m] offset 10m`)
another(`with (f(m, x)=m{x, bar="baz"}[5m] offset 10m) f(foo, {})`, `foo{bar="baz"}[5m] offset 10m`)
another(`with (f(x)=x[5m] offset 3s) f(foo[3m]+bar)`, `(foo[3m] + bar)[5m] offset 3s`)
another(`with (f(x)=x[5m:3s] oFFsEt 1.5m) f(sum(s) by (a,b))`, `(sum(s) by (a, b))[5m:3s] offset 1.5m`)
another(`with (x="a", y=x) y+"bc"`, `"abc"`)
another(`with (x="a", y="b"+x) "we"+y+"z"+f()`, `"webaz" + f()`)
another(`with (f(x) = m{foo=x+"y", bar="y"+x, baz=x} + x) f("qwe")`, `m{foo="qwey", bar="yqwe", baz="qwe"} + "qwe"`)
// Verify withExpr for aggr func modifiers
another(`with (f(x) = sum(m) by (x)) f(foo)`, `sum(m) by (foo)`)
another(`with (f(x) = sum(m) by (x)) f((foo, bar, foo))`, `sum(m) by (foo, bar)`)
another(`with (f(x) = sum(m) without (x,y)) f((a, b))`, `sum(m) without (a, b, y)`)
another(`with (f(x) = sum(m) without (y,x)) f((a, y))`, `sum(m) without (y, a)`)
another(`with (f(x,y) = a + on (x,y) group_left (y,bar) b) f(foo,())`, `a + on (foo) group_left (bar) b`)
another(`with (f(x,y) = a + on (x,y) group_left (y,bar) b) f((foo),())`, `a + on (foo) group_left (bar) b`)
another(`with (f(x,y) = a + on (x,y) group_left (y,bar) b) f((foo,xx),())`, `a + on (foo, xx) group_left (bar) b`)
// Verify nested with exprs
another(`with (f(x) = (with(x=y) x) + x) f(z)`, `y + z`)
another(`with (x=foo) f(a, with (y=x) y)`, `f(a, foo)`)
another(`with (x=foo) a * x + (with (y=x) y) / y`, `(a * foo) + (foo / y)`)
another(`with (x = with (y = foo) y + x) x/x`, `(foo + x) / (foo + x)`)
another(`with (
x = {foo="bar"},
q = m{x, y="1"},
f(x) =
with (
z(y) = x + y * q
)
z(foo) / f(x)
)
f(a)`, `(a + (foo * m{foo="bar", y="1"})) / f(a)`)
// complex withExpr
another(`WITH (
threshold = (0.9),
commonFilters = {job="cacher", instance=~"1.2.3.4"},
hits = rate(cache{type="hit", commonFilters}[5m]),
miss = rate(cache{type="miss", commonFilters}[5m]),
sumByInstance(arg) = sum(arg) by (instance),
hitRatio = sumByInstance(hits) / sumByInstance(hits + miss)
)
hitRatio < threshold`,
`(sum(rate(cache{type="hit", job="cacher", instance=~"1.2.3.4"}[5m])) by (instance) / sum(rate(cache{type="hit", job="cacher", instance=~"1.2.3.4"}[5m]) + rate(cache{type="miss", job="cacher", instance=~"1.2.3.4"}[5m])) by (instance)) < 0.9`)
another(`WITH (
x2(x) = x^2,
f(x, y) = x2(x) + x*y + x2(y)
)
f(a, 3)
`, `((a ^ 2) + (a * 3)) + 9`)
another(`WITH (
x2(x) = x^2,
f(x, y) = x2(x) + x*y + x2(y)
)
f(2, 3)
`, `19`)
another(`WITH (
commonFilters = {instance="foo"},
timeToFuckup(currv, maxv) = (maxv - currv) / rate(currv)
)
timeToFuckup(diskUsage{commonFilters}, maxDiskSize{commonFilters})`,
`(maxDiskSize{instance="foo"} - diskUsage{instance="foo"}) / rate(diskUsage{instance="foo"})`)
another(`WITH (
commonFilters = {job="foo", instance="bar"},
sumRate(m, cf) = sum(rate(m{cf})) by (job, instance),
hitRate(hits, misses) = sumRate(hits, commonFilters) / (sumRate(hits, commonFilters) + sumRate(misses, commonFilters))
)
hitRate(cacheHits, cacheMisses)`,
`sum(rate(cacheHits{job="foo", instance="bar"})) by (job, instance) / (sum(rate(cacheHits{job="foo", instance="bar"})) by (job, instance) + sum(rate(cacheMisses{job="foo", instance="bar"})) by (job, instance))`)
another(`with(y=123,z=5) union(with(y=3,f(x)=x*y) f(2) + f(3), with(x=5,y=2) x*y*z)`, `union(15, 50)`)
}
func TestParsePromQLError(t *testing.T) {
f := func(s string) {
t.Helper()
e, err := parsePromQL(s)
if err == nil {
t.Fatalf("expecting non-nil error when parsing %q", s)
}
if e != nil {
t.Fatalf("expecting nil expr when parsing %q", s)
}
}
// an empty string
f("")
f(" \t\b\r\n ")
// invalid metricExpr
f(`{__name__="ff"} offset 55`)
f(`{__name__="ff"} offset -5m`)
f(`foo[55]`)
f(`m[-5m]`)
f(`{`)
f(`foo{`)
f(`foo{bar`)
f(`foo{bar=`)
f(`foo{bar="baz"`)
f(`foo{bar="baz", `)
f(`foo{123="23"}`)
f(`foo{foo}`)
f(`foo{,}`)
f(`foo{,foo="bar"}`)
f(`foo{foo=}`)
f(`foo{foo="ba}`)
f(`foo{"foo"="bar"}`)
f(`foo{$`)
f(`foo{a $`)
f(`foo{a="b",$`)
f(`foo{a="b"}$`)
f(`[`)
f(`[]`)
f(`f[5m]$`)
f(`[5m]`)
f(`[5m] offset 4h`)
f(`m[5m] offset $`)
f(`m[5m] offset 5h $`)
f(`m[]`)
f(`m[-5m]`)
f(`m[5m:`)
f(`m[5m:-`)
f(`m[5m:-1`)
f(`m[5m:-1]`)
f(`m[:`)
f(`m[:-`)
f(`m[:1]`)
f(`m[:-1m]`)
f(`m[5]`)
f(`m[[5m]]`)
f(`m[foo]`)
f(`m["ff"]`)
f(`m[10m`)
f(`m[123`)
f(`m["ff`)
f(`m[(f`)
f(`fd}`)
f(`]`)
f(`m $`)
f(`m{,}`)
f(`m{x=y}`)
f(`m{x=y/5}`)
f(`m{x=y+5}`)
// Invalid regexp
f(`foo{bar=~"x["}`)
f(`foo{bar=~"x("}`)
f(`foo{bar=~"x)"}`)
f(`foo{bar!~"x["}`)
f(`foo{bar!~"x("}`)
f(`foo{bar!~"x)"}`)
// invalid stringExpr
f(`'`)
f(`"`)
f("`")
f(`"foo`)
f(`'foo`)
f("`foo")
f(`"foo\"bar`)
f(`'foo\'bar`)
f("`foo\\`bar")
f(`"" $`)
f(`"foo" +`)
f(`n{"foo" + m`)
// invalid numberExpr
f(`12.`)
f(`1.2e`)
f(`23e-`)
f(`23E+`)
f(`.`)
f(`-12.`)
f(`-1.2e`)
f(`-23e-`)
f(`-23E+`)
f(`-.`)
f(`-1$$`)
f(`-$$`)
f(`+$$`)
f(`23 $$`)
// invalid binaryOpExpr
f(`+`)
f(`1 +`)
f(`1 + 2.`)
f(`3 unless`)
f(`23 + on (foo)`)
f(`m + on (,) m`)
f(`3 * ignoring`)
f(`m * on (`)
f(`m * on (foo`)
f(`m * on (foo,`)
f(`m * on (foo,)`)
f(`m * on (,foo)`)
f(`m * on (,)`)
f(`m == bool (bar) baz`)
f(`m == bool () baz`)
f(`m * by (baz) n`)
f(`m + bool group_left m2`)
f(`m + on () group_left (`)
f(`m + on () group_left (,`)
f(`m + on () group_left (,foo`)
f(`m + on () group_left (foo,)`)
f(`m + on () group_left (,foo)`)
f(`m + on () group_left (foo)`)
f(`m + on () group_right (foo) (m`)
f(`m or ignoring () group_left () n`)
f(`1 + bool 2`)
f(`m % bool n`)
f(`m * bool baz`)
f(`M * BOoL BaZ`)
f(`foo unless ignoring (bar) group_left xxx`)
f(`foo or bool bar`)
f(`foo == bool $$`)
f(`"foo" + bar`)
// invalid parensExpr
f(`(`)
f(`($`)
f(`(+`)
f(`(1`)
f(`(m+`)
f(`1)`)
f(`(,)`)
f(`(1)$`)
// invalid funcExpr
f(`f $`)
f(`f($)`)
f(`f[`)
f(`f()$`)
f(`f(`)
f(`f(foo`)
f(`f(f,`)
f(`f(,`)
f(`f(,)`)
f(`f(,foo)`)
f(`f(,foo`)
f(`f(foo,$`)
f(`f() by (a)`)
f(`f without (x) (y)`)
f(`f() foo (a)`)
f(`f bar (x) (b)`)
f(`f bar (x)`)
// invalid aggrFuncExpr
f(`sum(`)
f(`sum $`)
f(`sum [`)
f(`sum($)`)
f(`sum()$`)
f(`sum(foo) ba`)
f(`sum(foo) ba()`)
f(`sum(foo) by`)
f(`sum(foo) without x`)
f(`sum(foo) aaa`)
f(`sum(foo) aaa x`)
f(`sum() by $`)
f(`sum() by (`)
f(`sum() by ($`)
f(`sum() by (a`)
f(`sum() by (a $`)
f(`sum() by (a ]`)
f(`sum() by (a)$`)
f(`sum() by (,`)
f(`sum() by (a,$`)
f(`sum() by (,)`)
f(`sum() by (,a`)
f(`sum() by (,a)`)
f(`sum() on (b)`)
f(`sum() bool`)
f(`sum() group_left`)
f(`sum() group_right(x)`)
f(`sum ba`)
f(`sum ba ()`)
f(`sum by (`)
f(`sum by (a`)
f(`sum by (,`)
f(`sum by (,)`)
f(`sum by (,a`)
f(`sum by (,a)`)
f(`sum by (a)`)
f(`sum by (a) (`)
f(`sum by (a) [`)
f(`sum by (a) {`)
f(`sum by (a) (b`)
f(`sum by (a) (b,`)
f(`sum by (a) (,)`)
f(`avg by (a) (,b)`)
f(`sum by (x) (y) by (z)`)
f(`sum(m) by (1)`)
// invalid withExpr
f(`with $`)
f(`with a`)
f(`with a=b c`)
f(`with (`)
f(`with (x=b)$`)
f(`with ($`)
f(`with (foo`)
f(`with (foo $`)
f(`with (x y`)
f(`with (x =`)
f(`with (x = $`)
f(`with (x= y`)
f(`with (x= y $`)
f(`with (x= y)`)
f(`with (x=(`)
f(`with (x=[)`)
f(`with (x=() x)`)
f(`with ($$)`)
f(`with (x $$`)
f(`with (x = $$)`)
f(`with (x = foo) bar{x}`)
f(`with (x = {foo="bar"}[5m]) bar{x}`)
f(`with (x = {foo="bar"} offset 5m) bar{x}`)
f(`with (x = a, x = b) c`)
f(`with (x(a, a) = b) c`)
f(`with (x=m{f="x"}) foo{x}`)
f(`with (sum = x) y`)
f(`with (rate(a) = b) c`)
f(`with (clamp_min=x) y`)
f(`with (f()`)
f(`with (a=b c=d) e`)
f(`with (f(x)=x^2) m{x}`)
f(`with (f(x)=ff()) m{x}`)
f(`with (f(x`)
f(`with (x=m) a{x} + b`)
f(`with (x=m) b + a{x}`)
f(`with (x=m) f(b, a{x})`)
f(`with (x=m) sum(a{x})`)
f(`with (x=m) (a{x})`)
f(`with (f(a)=a) f`)
f(`with (f(x)=x{foo="bar"}) f(1)`)
f(`with (f(x)=x{foo="bar"}) f(m + n)`)
f(`with (f = with`)
f(`with (,)`)
f(`with (1) 2`)
f(`with (f(1)=2) 3`)
f(`with (f(,)=x) x`)
f(`with (x(a) = {b="c"}) foo{x}`)
f(`with (f(x) = m{foo=xx}) f("qwe")`)
f(`a + with(f(x)=x) f`)
f(`with (f(x) = x, y = sum(m) by (f)) y`)
f(`with (f(x) = sum(m) by (x)) f({foo="bar"})`)
f(`with (f(x) = sum(m) by (x)) f((xx(), {foo="bar"}))`)
f(`with (f(x) = m + on (x) n) f(xx())`)
f(`with (f(x) = m + on (a) group_right (x) n) f(xx())`)
}
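For readers unfamiliar with MetricsQL WITH templates: the another() cases above assert that parsing a query with a WITH clause yields an expression whose serialized form equals the fully expanded query. A minimal illustrative sketch follows, assuming the unexported parsePromQL entry point used by these tests and an AppendString-style serializer on the returned expression (both names appear on this page; the snippet itself is not part of the repository):

// Illustrative only. Mirrors the existing test case
//   with (f(x) = sum(m) by (x)) f(foo)  =>  sum(m) by (foo)
// from the table above, and would live in the same package as this test file.
func exampleWithExpansion() {
	e, err := parsePromQL(`with (f(x) = sum(m) by (x)) f(foo)`)
	if err != nil {
		panic(err)
	}
	// The WITH binding is expanded at parse time, so serializing the result
	// prints the plain query: sum(m) by (foo)
	fmt.Printf("%s\n", e.AppendString(nil))
}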

File diff suppressed because it is too large

View File

@@ -2,21 +2,32 @@ package promql
import (
"crypto/rand"
"flag"
"fmt"
"runtime"
"sync"
"sync/atomic"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/auth"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/memory"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/metricsql"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/workingsetcache"
"github.com/VictoriaMetrics/fastcache"
"github.com/VictoriaMetrics/metrics"
)
var (
disableCache = flag.Bool("search.disableCache", false, "Whether to disable response caching. This may be useful during data backfilling")
cacheTimestampOffset = flag.Duration("search.cacheTimestampOffset", 5*time.Minute, "The maximum duration since the current time for response data, "+
"which is always queried from the original raw data, without using the response cache. Increase this value if you see gaps in responses "+
"due to time synchronization issues between VictoriaMetrics and data sources")
)
var rollupResultCacheV = &rollupResultCache{
fastcache.New(1024 * 1024), // This is a cache for testing.
c: workingsetcache.New(1024*1024, time.Hour), // This is a cache for testing.
}
var rollupResultCachePath string
@@ -37,16 +48,23 @@ var (
)
// InitRollupResultCache initializes the rollupResult cache
//
// If cachePath is empty, then the cache isn't stored to persistent disk.
func InitRollupResultCache(cachePath string) {
rollupResultCachePath = cachePath
startTime := time.Now()
var c *fastcache.Cache
cacheSize := getRollupResultCacheSize()
var c *workingsetcache.Cache
if len(rollupResultCachePath) > 0 {
logger.Infof("loading rollupResult cache from %q...", rollupResultCachePath)
c = fastcache.LoadFromFileOrNew(rollupResultCachePath, getRollupResultCacheSize())
c = workingsetcache.Load(rollupResultCachePath, cacheSize, time.Hour)
} else {
c = fastcache.New(getRollupResultCacheSize())
c = workingsetcache.New(cacheSize, time.Hour)
}
if *disableCache {
c.Reset()
}
stats := &fastcache.Stats{}
var statsLock sync.Mutex
var statsLastUpdate time.Time
@@ -64,8 +82,8 @@ func InitRollupResultCache(cachePath string) {
return stats
}
if len(rollupResultCachePath) > 0 {
logger.Infof("loaded rollupResult cache from %q in %s; entriesCount: %d, bytesSize: %d",
rollupResultCachePath, time.Since(startTime), fcs().EntriesCount, fcs().BytesSize)
logger.Infof("loaded rollupResult cache from %q in %.3f seconds; entriesCount: %d, sizeBytes: %d",
rollupResultCachePath, time.Since(startTime).Seconds(), fcs().EntriesCount, fcs().BytesSize)
}
metrics.NewGauge(`vm_cache_entries{type="promql/rollupResult"}`, func() float64 {
@@ -89,25 +107,28 @@ func InitRollupResultCache(cachePath string) {
// StopRollupResultCache closes the rollupResult cache.
func StopRollupResultCache() {
if len(rollupResultCachePath) == 0 {
rollupResultCacheV.c.Reset()
rollupResultCacheV.c.Stop()
rollupResultCacheV.c = nil
return
}
gomaxprocs := runtime.GOMAXPROCS(-1)
logger.Infof("saving rollupResult cache to %q...", rollupResultCachePath)
startTime := time.Now()
if err := rollupResultCacheV.c.SaveToFileConcurrent(rollupResultCachePath, gomaxprocs); err != nil {
if err := rollupResultCacheV.c.Save(rollupResultCachePath); err != nil {
logger.Errorf("cannot close rollupResult cache at %q: %s", rollupResultCachePath, err)
} else {
var fcs fastcache.Stats
rollupResultCacheV.c.UpdateStats(&fcs)
rollupResultCacheV.c.Reset()
logger.Infof("saved rollupResult cache to %q in %s; entriesCount: %d, bytesSize: %d",
rollupResultCachePath, time.Since(startTime), fcs.EntriesCount, fcs.BytesSize)
return
}
var fcs fastcache.Stats
rollupResultCacheV.c.UpdateStats(&fcs)
rollupResultCacheV.c.Stop()
rollupResultCacheV.c = nil
logger.Infof("saved rollupResult cache to %q in %.3f seconds; entriesCount: %d, sizeBytes: %d",
rollupResultCachePath, time.Since(startTime).Seconds(), fcs.EntriesCount, fcs.BytesSize)
}
// TODO: convert this cache to distributed cache shared among vmselect
// instances in the cluster.
type rollupResultCache struct {
c *fastcache.Cache
c *workingsetcache.Cache
}
var rollupResultCacheResets = metrics.NewCounter(`vm_cache_resets_total{type="promql/rollupResult"}`)
@@ -116,10 +137,11 @@ var rollupResultCacheResets = metrics.NewCounter(`vm_cache_resets_total{type="pr
func ResetRollupResultCache() {
rollupResultCacheResets.Inc()
rollupResultCacheV.c.Reset()
logger.Infof("rollupResult cache has been cleared")
}
func (rrc *rollupResultCache) Get(funcName string, ec *EvalConfig, me *metricExpr, window int64) (tss []*timeseries, newStart int64) {
if !ec.mayCache() {
func (rrc *rollupResultCache) Get(ec *EvalConfig, expr metricsql.Expr, window int64) (tss []*timeseries, newStart int64) {
if *disableCache || !ec.mayCache() {
return nil, ec.Start
}
@@ -127,7 +149,7 @@ func (rrc *rollupResultCache) Get(funcName string, ec *EvalConfig, me *metricExp
bb := bbPool.Get()
defer bbPool.Put(bb)
bb.B = marshalRollupResultCacheKey(bb.B[:0], funcName, me, window, ec.Step)
bb.B = marshalRollupResultCacheKey(bb.B[:0], ec.AuthToken, expr, window, ec.Step)
metainfoBuf := rrc.c.Get(nil, bb.B)
if len(metainfoBuf) == 0 {
return nil, ec.Start
@@ -141,15 +163,23 @@ func (rrc *rollupResultCache) Get(funcName string, ec *EvalConfig, me *metricExp
return nil, ec.Start
}
bb.B = key.Marshal(bb.B[:0])
resultBuf := rrc.c.GetBig(nil, bb.B)
if len(resultBuf) == 0 {
compressedResultBuf := resultBufPool.Get()
defer resultBufPool.Put(compressedResultBuf)
compressedResultBuf.B = rrc.c.GetBig(compressedResultBuf.B[:0], bb.B)
if len(compressedResultBuf.B) == 0 {
mi.RemoveKey(key)
metainfoBuf = mi.Marshal(metainfoBuf[:0])
bb.B = marshalRollupResultCacheKey(bb.B[:0], funcName, me, window, ec.Step)
bb.B = marshalRollupResultCacheKey(bb.B[:0], ec.AuthToken, expr, window, ec.Step)
rrc.c.Set(bb.B, metainfoBuf)
return nil, ec.Start
}
tss, err := unmarshalTimeseriesFast(resultBuf)
// Decompress into a newly allocated byte slice: the tss returned from unmarshalTimeseriesFast
// refer to this slice, so it cannot be returned to resultBufPool.
resultBuf, err := encoding.DecompressZSTD(nil, compressedResultBuf.B)
if err != nil {
logger.Panicf("BUG: cannot decompress resultBuf from rollupResultCache: %s; it looks like it was improperly saved", err)
}
tss, err = unmarshalTimeseriesFast(resultBuf)
if err != nil {
logger.Panicf("BUG: cannot unmarshal timeseries from rollupResultCache: %s; it looks like it was improperly saved", err)
}
@@ -189,15 +219,17 @@ func (rrc *rollupResultCache) Get(funcName string, ec *EvalConfig, me *metricExp
return tss, newStart
}
func (rrc *rollupResultCache) Put(funcName string, ec *EvalConfig, me *metricExpr, window int64, tss []*timeseries) {
if len(tss) == 0 || !ec.mayCache() {
var resultBufPool bytesutil.ByteBufferPool
func (rrc *rollupResultCache) Put(ec *EvalConfig, expr metricsql.Expr, window int64, tss []*timeseries) {
if *disableCache || len(tss) == 0 || !ec.mayCache() {
return
}
// Remove values up to currentTime - step - maxSilenceInterval,
// Remove values up to currentTime - step - cacheTimestampOffset,
// since these values may be added later.
timestamps := tss[0].Timestamps
deadline := (time.Now().UnixNano() / 1e6) - ec.Step - maxSilenceInterval
deadline := (time.Now().UnixNano() / 1e6) - ec.Step - cacheTimestampOffset.Milliseconds()
i := len(timestamps) - 1
for i >= 0 && timestamps[i] > deadline {
i--
@@ -220,11 +252,16 @@ func (rrc *rollupResultCache) Put(funcName string, ec *EvalConfig, me *metricExp
// Store tss in the cache.
maxMarshaledSize := getRollupResultCacheSize() / 4
tssMarshaled := marshalTimeseriesFast(tss, maxMarshaledSize, ec.Step)
if tssMarshaled == nil {
resultBuf := resultBufPool.Get()
defer resultBufPool.Put(resultBuf)
resultBuf.B = marshalTimeseriesFast(resultBuf.B[:0], tss, maxMarshaledSize, ec.Step)
if len(resultBuf.B) == 0 {
tooBigRollupResults.Inc()
return
}
compressedResultBuf := resultBufPool.Get()
defer resultBufPool.Put(compressedResultBuf)
compressedResultBuf.B = encoding.CompressZSTDLevel(compressedResultBuf.B[:0], resultBuf.B, 1)
bb := bbPool.Get()
defer bbPool.Put(bb)
@@ -233,9 +270,9 @@ func (rrc *rollupResultCache) Put(funcName string, ec *EvalConfig, me *metricExp
key.prefix = rollupResultCacheKeyPrefix
key.suffix = atomic.AddUint64(&rollupResultCacheKeySuffix, 1)
bb.B = key.Marshal(bb.B[:0])
rrc.c.SetBig(bb.B, tssMarshaled)
rrc.c.SetBig(bb.B, compressedResultBuf.B)
bb.B = marshalRollupResultCacheKey(bb.B[:0], funcName, me, window, ec.Step)
bb.B = marshalRollupResultCacheKey(bb.B[:0], ec.AuthToken, expr, window, ec.Step)
metainfoBuf := rrc.c.Get(nil, bb.B)
var mi rollupResultCacheMetainfo
if len(metainfoBuf) > 0 {
@@ -263,17 +300,15 @@ var (
var tooBigRollupResults = metrics.NewCounter("vm_too_big_rollup_results_total")
// Increment this value every time the format of the cache changes.
const rollupResultCacheVersion = 4
const rollupResultCacheVersion = 7
func marshalRollupResultCacheKey(dst []byte, funcName string, me *metricExpr, window, step int64) []byte {
func marshalRollupResultCacheKey(dst []byte, at *auth.Token, expr metricsql.Expr, window, step int64) []byte {
dst = append(dst, rollupResultCacheVersion)
dst = encoding.MarshalUint64(dst, uint64(len(funcName)))
dst = append(dst, funcName...)
dst = encoding.MarshalUint32(dst, at.AccountID)
dst = encoding.MarshalUint32(dst, at.ProjectID)
dst = encoding.MarshalInt64(dst, window)
dst = encoding.MarshalInt64(dst, step)
for i := range me.TagFilters {
dst = me.TagFilters[i].Marshal(dst)
}
dst = expr.AppendString(dst)
return dst
}
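To summarize the cache-key change above: the key now starts with a one-byte format version, followed by the tenant identifiers from the auth token, the window and step, and the serialized expression, replacing the old funcName+TagFilters layout. A rough standalone sketch of that layout is below; it is illustrative only, since the real code uses the lib/encoding helpers shown above and the exact byte encoding may differ.

// Hypothetical re-implementation for illustration; not part of the diff.
package main

import (
	"encoding/binary"
	"fmt"
)

func marshalKeySketch(version byte, accountID, projectID uint32, window, step int64, exprStr string) []byte {
	var buf [8]byte
	dst := []byte{version} // cache format version; bumping it (4 -> 7 in this diff) drops all old entries
	binary.BigEndian.PutUint32(buf[:4], accountID)
	dst = append(dst, buf[:4]...)
	binary.BigEndian.PutUint32(buf[:4], projectID)
	dst = append(dst, buf[:4]...)
	binary.BigEndian.PutUint64(buf[:], uint64(window))
	dst = append(dst, buf[:]...)
	binary.BigEndian.PutUint64(buf[:], uint64(step))
	dst = append(dst, buf[:]...)
	dst = append(dst, exprStr...) // expr.AppendString output in the real code
	return dst
}

func main() {
	key := marshalKeySketch(7, 333, 843, 456, 200, `sum(rate(m[5m]))`)
	fmt.Printf("key length: %d bytes\n", len(key))
}

Including the tenant's AccountID and ProjectID in the key keeps cached results isolated per tenant, which is why ec.AuthToken is now threaded through Get and Put.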

View File

@@ -3,30 +3,44 @@ package promql
import (
"testing"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/auth"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/metricsql"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/storage"
)
func TestRollupResultCache(t *testing.T) {
ResetRollupResultCache()
funcName := "foo"
window := int64(456)
ec := &EvalConfig{
Start: 1000,
End: 2000,
Step: 200,
AuthToken: &auth.Token{
AccountID: 333,
ProjectID: 843,
},
MayCache: true,
}
me := &metricExpr{
TagFilters: []storage.TagFilter{{
Key: []byte("aaa"),
Value: []byte("xxx"),
me := &metricsql.MetricExpr{
LabelFilters: []metricsql.LabelFilter{{
Label: "aaa",
Value: "xxx",
}},
}
fe := &metricsql.FuncExpr{
Name: "foo",
Args: []metricsql.Expr{me},
}
ae := &metricsql.AggrFuncExpr{
Name: "foobar",
Args: []metricsql.Expr{fe},
}
// Try obtaining an empty value.
t.Run("empty", func(t *testing.T) {
tss, newStart := rollupResultCacheV.Get(funcName, ec, me, window)
tss, newStart := rollupResultCacheV.Get(ec, fe, window)
if newStart != ec.Start {
t.Fatalf("unexpected newStart; got %d; want %d", newStart, ec.Start)
}
@@ -36,21 +50,42 @@ func TestRollupResultCache(t *testing.T) {
})
// Store timeseries overlapping with start
t.Run("start-overlap", func(t *testing.T) {
t.Run("start-overlap-no-ae", func(t *testing.T) {
ResetRollupResultCache()
tss := []*timeseries{
&timeseries{
{
Timestamps: []int64{800, 1000, 1200},
Values: []float64{0, 1, 2},
},
}
rollupResultCacheV.Put(funcName, ec, me, window, tss)
tss, newStart := rollupResultCacheV.Get(funcName, ec, me, window)
rollupResultCacheV.Put(ec, fe, window, tss)
tss, newStart := rollupResultCacheV.Get(ec, fe, window)
if newStart != 1400 {
t.Fatalf("unexpected newStart; got %d; want %d", newStart, 1400)
}
tssExpected := []*timeseries{
&timeseries{
{
Timestamps: []int64{1000, 1200},
Values: []float64{1, 2},
},
}
testTimeseriesEqual(t, tss, tssExpected)
})
t.Run("start-overlap-with-ae", func(t *testing.T) {
ResetRollupResultCache()
tss := []*timeseries{
{
Timestamps: []int64{800, 1000, 1200},
Values: []float64{0, 1, 2},
},
}
rollupResultCacheV.Put(ec, ae, window, tss)
tss, newStart := rollupResultCacheV.Get(ec, ae, window)
if newStart != 1400 {
t.Fatalf("unexpected newStart; got %d; want %d", newStart, 1400)
}
tssExpected := []*timeseries{
{
Timestamps: []int64{1000, 1200},
Values: []float64{1, 2},
},
@@ -62,13 +97,13 @@ func TestRollupResultCache(t *testing.T) {
t.Run("end-overlap", func(t *testing.T) {
ResetRollupResultCache()
tss := []*timeseries{
&timeseries{
{
Timestamps: []int64{1800, 2000, 2200, 2400},
Values: []float64{333, 0, 1, 2},
},
}
rollupResultCacheV.Put(funcName, ec, me, window, tss)
tss, newStart := rollupResultCacheV.Get(funcName, ec, me, window)
rollupResultCacheV.Put(ec, fe, window, tss)
tss, newStart := rollupResultCacheV.Get(ec, fe, window)
if newStart != 1000 {
t.Fatalf("unexpected newStart; got %d; want %d", newStart, 1000)
}
@@ -81,13 +116,13 @@ func TestRollupResultCache(t *testing.T) {
t.Run("full-cover", func(t *testing.T) {
ResetRollupResultCache()
tss := []*timeseries{
&timeseries{
{
Timestamps: []int64{1200, 1400, 1600},
Values: []float64{0, 1, 2},
},
}
rollupResultCacheV.Put(funcName, ec, me, window, tss)
tss, newStart := rollupResultCacheV.Get(funcName, ec, me, window)
rollupResultCacheV.Put(ec, fe, window, tss)
tss, newStart := rollupResultCacheV.Get(ec, fe, window)
if newStart != 1000 {
t.Fatalf("unexpected newStart; got %d; want %d", newStart, 1000)
}
@@ -100,13 +135,13 @@ func TestRollupResultCache(t *testing.T) {
t.Run("before-start", func(t *testing.T) {
ResetRollupResultCache()
tss := []*timeseries{
&timeseries{
{
Timestamps: []int64{200, 400, 600},
Values: []float64{0, 1, 2},
},
}
rollupResultCacheV.Put(funcName, ec, me, window, tss)
tss, newStart := rollupResultCacheV.Get(funcName, ec, me, window)
rollupResultCacheV.Put(ec, fe, window, tss)
tss, newStart := rollupResultCacheV.Get(ec, fe, window)
if newStart != 1000 {
t.Fatalf("unexpected newStart; got %d; want %d", newStart, 1000)
}
@@ -119,13 +154,13 @@ func TestRollupResultCache(t *testing.T) {
t.Run("after-end", func(t *testing.T) {
ResetRollupResultCache()
tss := []*timeseries{
&timeseries{
{
Timestamps: []int64{2200, 2400, 2600},
Values: []float64{0, 1, 2},
},
}
rollupResultCacheV.Put(funcName, ec, me, window, tss)
tss, newStart := rollupResultCacheV.Get(funcName, ec, me, window)
rollupResultCacheV.Put(ec, fe, window, tss)
tss, newStart := rollupResultCacheV.Get(ec, fe, window)
if newStart != 1000 {
t.Fatalf("unexpected newStart; got %d; want %d", newStart, 1000)
}
@@ -138,18 +173,18 @@ func TestRollupResultCache(t *testing.T) {
t.Run("bigger-than-start-end", func(t *testing.T) {
ResetRollupResultCache()
tss := []*timeseries{
&timeseries{
{
Timestamps: []int64{800, 1000, 1200, 1400, 1600, 1800, 2000, 2200},
Values: []float64{0, 1, 2, 3, 4, 5, 6, 7},
},
}
rollupResultCacheV.Put(funcName, ec, me, window, tss)
tss, newStart := rollupResultCacheV.Get(funcName, ec, me, window)
rollupResultCacheV.Put(ec, fe, window, tss)
tss, newStart := rollupResultCacheV.Get(ec, fe, window)
if newStart != 2200 {
t.Fatalf("unexpected newStart; got %d; want %d", newStart, 2200)
}
tssExpected := []*timeseries{
&timeseries{
{
Timestamps: []int64{1000, 1200, 1400, 1600, 1800, 2000},
Values: []float64{1, 2, 3, 4, 5, 6},
},
@@ -161,18 +196,18 @@ func TestRollupResultCache(t *testing.T) {
t.Run("start-end-match", func(t *testing.T) {
ResetRollupResultCache()
tss := []*timeseries{
&timeseries{
{
Timestamps: []int64{1000, 1200, 1400, 1600, 1800, 2000},
Values: []float64{1, 2, 3, 4, 5, 6},
},
}
rollupResultCacheV.Put(funcName, ec, me, window, tss)
tss, newStart := rollupResultCacheV.Get(funcName, ec, me, window)
rollupResultCacheV.Put(ec, fe, window, tss)
tss, newStart := rollupResultCacheV.Get(ec, fe, window)
if newStart != 2200 {
t.Fatalf("unexpected newStart; got %d; want %d", newStart, 2200)
}
tssExpected := []*timeseries{
&timeseries{
{
Timestamps: []int64{1000, 1200, 1400, 1600, 1800, 2000},
Values: []float64{1, 2, 3, 4, 5, 6},
},
@@ -191,8 +226,8 @@ func TestRollupResultCache(t *testing.T) {
}
tss = append(tss, ts)
}
rollupResultCacheV.Put(funcName, ec, me, window, tss)
tssResult, newStart := rollupResultCacheV.Get(funcName, ec, me, window)
rollupResultCacheV.Put(ec, fe, window, tss)
tssResult, newStart := rollupResultCacheV.Get(ec, fe, window)
if newStart != 2200 {
t.Fatalf("unexpected newStart; got %d; want %d", newStart, 2200)
}
@@ -203,32 +238,32 @@ func TestRollupResultCache(t *testing.T) {
t.Run("multi-timeseries", func(t *testing.T) {
ResetRollupResultCache()
tss1 := []*timeseries{
&timeseries{
{
Timestamps: []int64{800, 1000, 1200},
Values: []float64{0, 1, 2},
},
}
tss2 := []*timeseries{
&timeseries{
{
Timestamps: []int64{1800, 2000, 2200, 2400},
Values: []float64{333, 0, 1, 2},
},
}
tss3 := []*timeseries{
&timeseries{
{
Timestamps: []int64{1200, 1400, 1600},
Values: []float64{0, 1, 2},
},
}
rollupResultCacheV.Put(funcName, ec, me, window, tss1)
rollupResultCacheV.Put(funcName, ec, me, window, tss2)
rollupResultCacheV.Put(funcName, ec, me, window, tss3)
tss, newStart := rollupResultCacheV.Get(funcName, ec, me, window)
rollupResultCacheV.Put(ec, fe, window, tss1)
rollupResultCacheV.Put(ec, fe, window, tss2)
rollupResultCacheV.Put(ec, fe, window, tss3)
tss, newStart := rollupResultCacheV.Get(ec, fe, window)
if newStart != 1400 {
t.Fatalf("unexpected newStart; got %d; want %d", newStart, 1400)
}
tssExpected := []*timeseries{
&timeseries{
{
Timestamps: []int64{1000, 1200},
Values: []float64{1, 2},
},
@@ -249,14 +284,14 @@ func TestMergeTimeseries(t *testing.T) {
t.Run("bStart=ec.Start", func(t *testing.T) {
a := []*timeseries{}
b := []*timeseries{
&timeseries{
{
Timestamps: []int64{1000, 1200, 1400, 1600, 1800, 2000},
Values: []float64{1, 2, 3, 4, 5, 6},
},
}
tss := mergeTimeseries(a, b, 1000, ec)
tssExpected := []*timeseries{
&timeseries{
{
Timestamps: []int64{1000, 1200, 1400, 1600, 1800, 2000},
Values: []float64{1, 2, 3, 4, 5, 6},
},
@@ -266,14 +301,14 @@ func TestMergeTimeseries(t *testing.T) {
t.Run("a-empty", func(t *testing.T) {
a := []*timeseries{}
b := []*timeseries{
&timeseries{
{
Timestamps: []int64{1400, 1600, 1800, 2000},
Values: []float64{3, 4, 5, 6},
},
}
tss := mergeTimeseries(a, b, bStart, ec)
tssExpected := []*timeseries{
&timeseries{
{
Timestamps: []int64{1000, 1200, 1400, 1600, 1800, 2000},
Values: []float64{nan, nan, 3, 4, 5, 6},
},
@@ -282,7 +317,7 @@ func TestMergeTimeseries(t *testing.T) {
})
t.Run("b-empty", func(t *testing.T) {
a := []*timeseries{
&timeseries{
{
Timestamps: []int64{1000, 1200},
Values: []float64{2, 1},
},
@@ -290,7 +325,7 @@ func TestMergeTimeseries(t *testing.T) {
b := []*timeseries{}
tss := mergeTimeseries(a, b, bStart, ec)
tssExpected := []*timeseries{
&timeseries{
{
Timestamps: []int64{1000, 1200, 1400, 1600, 1800, 2000},
Values: []float64{2, 1, nan, nan, nan, nan},
},
@@ -299,20 +334,20 @@ func TestMergeTimeseries(t *testing.T) {
})
t.Run("non-empty", func(t *testing.T) {
a := []*timeseries{
&timeseries{
{
Timestamps: []int64{1000, 1200},
Values: []float64{2, 1},
},
}
b := []*timeseries{
&timeseries{
{
Timestamps: []int64{1400, 1600, 1800, 2000},
Values: []float64{3, 4, 5, 6},
},
}
tss := mergeTimeseries(a, b, bStart, ec)
tssExpected := []*timeseries{
&timeseries{
{
Timestamps: []int64{1000, 1200, 1400, 1600, 1800, 2000},
Values: []float64{2, 1, 3, 4, 5, 6},
},
@@ -321,14 +356,14 @@ func TestMergeTimeseries(t *testing.T) {
})
t.Run("non-empty-distinct-metric-names", func(t *testing.T) {
a := []*timeseries{
&timeseries{
{
Timestamps: []int64{1000, 1200},
Values: []float64{2, 1},
},
}
a[0].MetricName.MetricGroup = []byte("bar")
b := []*timeseries{
&timeseries{
{
Timestamps: []int64{1400, 1600, 1800, 2000},
Values: []float64{3, 4, 5, 6},
},
@@ -336,14 +371,14 @@ func TestMergeTimeseries(t *testing.T) {
b[0].MetricName.MetricGroup = []byte("foo")
tss := mergeTimeseries(a, b, bStart, ec)
tssExpected := []*timeseries{
&timeseries{
{
MetricName: storage.MetricName{
MetricGroup: []byte("foo"),
},
Timestamps: []int64{1000, 1200, 1400, 1600, 1800, 2000},
Values: []float64{nan, nan, 3, 4, 5, 6},
},
&timeseries{
{
MetricName: storage.MetricName{
MetricGroup: []byte("bar"),
},
@@ -362,7 +397,7 @@ func testTimeseriesEqual(t *testing.T, tss, tssExpected []*timeseries) {
}
for i, ts := range tss {
tsExpected := tssExpected[i]
testMetricNamesEqual(t, &ts.MetricName, &tsExpected.MetricName)
testMetricNamesEqual(t, &ts.MetricName, &tsExpected.MetricName, i)
testRowsEqual(t, ts.Values, ts.Timestamps, tsExpected.Values, tsExpected.Timestamps)
}
}

View File

@@ -3,6 +3,8 @@ package promql
import (
"math"
"testing"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/metricsql"
)
var (
@@ -10,6 +12,79 @@ var (
testTimestamps = []int64{5, 15, 24, 36, 49, 60, 78, 80, 97, 115, 120, 130}
)
func TestRollupIderivDuplicateTimestamps(t *testing.T) {
rfa := &rollupFuncArg{
values: []float64{1, 2, 3, 4, 5},
timestamps: []int64{100, 100, 200, 300, 300},
}
n := rollupIderiv(rfa)
if n != 20 {
t.Fatalf("unexpected value; got %v; want %v", n, 20)
}
rfa = &rollupFuncArg{
values: []float64{1, 2, 3, 4, 5},
timestamps: []int64{100, 100, 300, 300, 300},
}
n = rollupIderiv(rfa)
if n != 15 {
t.Fatalf("unexpected value; got %v; want %v", n, 15)
}
rfa = &rollupFuncArg{
prevValue: nan,
values: []float64{},
timestamps: []int64{},
}
n = rollupIderiv(rfa)
if !math.IsNaN(n) {
t.Fatalf("unexpected value; got %v; want %v", n, nan)
}
rfa = &rollupFuncArg{
prevValue: nan,
values: []float64{15},
timestamps: []int64{100},
}
n = rollupIderiv(rfa)
if !math.IsNaN(n) {
t.Fatalf("unexpected value; got %v; want %v", n, nan)
}
rfa = &rollupFuncArg{
prevTimestamp: 90,
prevValue: 10,
values: []float64{15},
timestamps: []int64{100},
}
n = rollupIderiv(rfa)
if n != 500 {
t.Fatalf("unexpected value; got %v; want %v", n, 500)
}
rfa = &rollupFuncArg{
prevTimestamp: 100,
prevValue: 10,
values: []float64{15},
timestamps: []int64{100},
}
n = rollupIderiv(rfa)
if n != inf {
t.Fatalf("unexpected value; got %v; want %v", n, inf)
}
rfa = &rollupFuncArg{
prevTimestamp: 100,
prevValue: 10,
values: []float64{15, 20},
timestamps: []int64{100, 100},
}
n = rollupIderiv(rfa)
if n != inf {
t.Fatalf("unexpected value; got %v; want %v", n, inf)
}
}
func TestRemoveCounterResets(t *testing.T) {
removeCounterResets(nil)
@@ -38,19 +113,19 @@ func TestDeltaValues(t *testing.T) {
values := []float64{123}
deltaValues(values)
valuesExpected := []float64{nan}
valuesExpected := []float64{0}
testRowsEqual(t, values, testTimestamps[:1], valuesExpected, testTimestamps[:1])
values = append([]float64{}, testValues...)
deltaValues(values)
valuesExpected = []float64{-89, 10, -23, 33, -20, 65, -87, 32, -12, 2, 0, nan}
valuesExpected = []float64{-89, 10, -23, 33, -20, 65, -87, 32, -12, 2, 0, 0}
testRowsEqual(t, values, testTimestamps, valuesExpected, testTimestamps)
// remove counter resets
values = append([]float64{}, testValues...)
removeCounterResets(values)
deltaValues(values)
valuesExpected = []float64{34, 10, 21, 33, 34, 65, 12, 32, 32, 2, 0, nan}
valuesExpected = []float64{34, 10, 21, 33, 34, 65, 12, 32, 32, 2, 0, 0}
testRowsEqual(t, values, testTimestamps, valuesExpected, testTimestamps)
}
@@ -59,13 +134,13 @@ func TestDerivValues(t *testing.T) {
values := []float64{123}
derivValues(values, testTimestamps[:1])
valuesExpected := []float64{nan}
valuesExpected := []float64{0}
testRowsEqual(t, values, testTimestamps[:1], valuesExpected, testTimestamps[:1])
values = append([]float64{}, testValues...)
derivValues(values, testTimestamps)
valuesExpected = []float64{-8900, 1111.111111111111, -1916.6666666666665, 2538.461538461538, -1818.1818181818182, 3611.111111111111,
-43500, 1882.3529411764705, -666.6666666666666, 400, 0, nan}
-43500, 1882.3529411764705, -666.6666666666666, 400, 0, 0}
testRowsEqual(t, values, testTimestamps, valuesExpected, testTimestamps)
// remove counter resets
@@ -73,11 +148,18 @@ func TestDerivValues(t *testing.T) {
removeCounterResets(values)
derivValues(values, testTimestamps)
valuesExpected = []float64{3400, 1111.111111111111, 1750, 2538.461538461538, 3090.909090909091, 3611.111111111111,
6000, 1882.3529411764705, 1777.7777777777776, 400, 0, nan}
6000, 1882.3529411764705, 1777.7777777777776, 400, 0, 0}
testRowsEqual(t, values, testTimestamps, valuesExpected, testTimestamps)
// duplicate timestamps
values = []float64{1, 2, 3, 4, 5, 6, 7}
timestamps := []int64{100, 100, 200, 200, 300, 400, 400}
derivValues(values, timestamps)
valuesExpected = []float64{0, 20, 20, 20, 10, 10, 10}
testRowsEqual(t, values, timestamps, valuesExpected, timestamps)
}
func testRollupFunc(t *testing.T, funcName string, args []interface{}, meExpected *metricExpr, vExpected float64) {
func testRollupFunc(t *testing.T, funcName string, args []interface{}, meExpected *metricsql.MetricExpr, vExpected float64) {
t.Helper()
nrf := getRollupFunc(funcName)
if nrf == nil {
@@ -102,13 +184,60 @@ func testRollupFunc(t *testing.T, funcName string, args []interface{}, meExpecte
t.Fatalf("unexpected value; got %v; want %v", v, vExpected)
}
} else {
if v != vExpected {
eps := math.Abs(v - vExpected)
if eps > 1e-14 {
t.Fatalf("unexpected value; got %v; want %v", v, vExpected)
}
}
}
}
func TestRollupShareLEOverTime(t *testing.T) {
f := func(le, vExpected float64) {
t.Helper()
les := []*timeseries{{
Values: []float64{le},
Timestamps: []int64{123},
}}
var me metricsql.MetricExpr
args := []interface{}{&metricsql.RollupExpr{Expr: &me}, les}
testRollupFunc(t, "share_le_over_time", args, &me, vExpected)
}
f(-123, 0)
f(0, 0)
f(10, 0)
f(12, 0.08333333333333333)
f(30, 0.16666666666666666)
f(50, 0.75)
f(100, 0.9166666666666666)
f(123, 1)
f(1000, 1)
}
func TestRollupShareGTOverTime(t *testing.T) {
f := func(gt, vExpected float64) {
t.Helper()
gts := []*timeseries{{
Values: []float64{gt},
Timestamps: []int64{123},
}}
var me metricsql.MetricExpr
args := []interface{}{&metricsql.RollupExpr{Expr: &me}, gts}
testRollupFunc(t, "share_gt_over_time", args, &me, vExpected)
}
f(-123, 1)
f(0, 1)
f(10, 1)
f(12, 0.9166666666666666)
f(30, 0.8333333333333334)
f(50, 0.25)
f(100, 0.08333333333333333)
f(123, 0)
f(1000, 0)
}
func TestRollupQuantileOverTime(t *testing.T) {
f := func(phi, vExpected float64) {
t.Helper()
@@ -116,8 +245,8 @@ func TestRollupQuantileOverTime(t *testing.T) {
Values: []float64{phi},
Timestamps: []int64{123},
}}
var me metricExpr
args := []interface{}{phis, &rollupExpr{Expr: &me}}
var me metricsql.MetricExpr
args := []interface{}{phis, &metricsql.RollupExpr{Expr: &me}}
testRollupFunc(t, "quantile_over_time", args, &me, vExpected)
}
@@ -138,15 +267,15 @@ func TestRollupPredictLinear(t *testing.T) {
Values: []float64{sec},
Timestamps: []int64{123},
}}
var me metricExpr
args := []interface{}{&rollupExpr{Expr: &me}, secs}
var me metricsql.MetricExpr
args := []interface{}{&metricsql.RollupExpr{Expr: &me}, secs}
testRollupFunc(t, "predict_linear", args, &me, vExpected)
}
f(0e-3, 63.739757761102624)
f(50e-3, 50.39682764539959)
f(100e-3, 37.053897529696556)
f(200e-3, 10.368037298290488)
f(0e-3, 30.382432471845043)
f(50e-3, 17.03950235614201)
f(100e-3, 3.696572240438975)
f(200e-3, -22.989287990967092)
}
func TestRollupHoltWinters(t *testing.T) {
@@ -160,8 +289,8 @@ func TestRollupHoltWinters(t *testing.T) {
Values: []float64{tf},
Timestamps: []int64{123},
}}
var me metricExpr
args := []interface{}{&rollupExpr{Expr: &me}, sfs, tfs}
var me metricsql.MetricExpr
args := []interface{}{&metricsql.RollupExpr{Expr: &me}, sfs, tfs}
testRollupFunc(t, "holt_winters", args, &me, vExpected)
}
@@ -181,33 +310,85 @@ func TestRollupHoltWinters(t *testing.T) {
f(0.9, 0.9, 33.99637566941818)
}
func TestRollupHoeffdingBoundLower(t *testing.T) {
f := func(phi, vExpected float64) {
t.Helper()
phis := []*timeseries{{
Values: []float64{phi},
Timestamps: []int64{123},
}}
var me metricsql.MetricExpr
args := []interface{}{phis, &metricsql.RollupExpr{Expr: &me}}
testRollupFunc(t, "hoeffding_bound_lower", args, &me, vExpected)
}
f(0.5, 28.21949401521037)
f(-1, 47.083333333333336)
f(0, 47.083333333333336)
f(1, -inf)
f(2, -inf)
f(0.1, 39.72878000047643)
f(0.9, 12.701803086472331)
}
func TestRollupHoeffdingBoundUpper(t *testing.T) {
f := func(phi, vExpected float64) {
t.Helper()
phis := []*timeseries{{
Values: []float64{phi},
Timestamps: []int64{123},
}}
var me metricsql.MetricExpr
args := []interface{}{phis, &metricsql.RollupExpr{Expr: &me}}
testRollupFunc(t, "hoeffding_bound_upper", args, &me, vExpected)
}
f(0.5, 65.9471726514563)
f(-1, 47.083333333333336)
f(0, 47.083333333333336)
f(1, inf)
f(2, inf)
f(0.1, 54.43788666619024)
f(0.9, 81.46486358019433)
}
func TestRollupNewRollupFuncSuccess(t *testing.T) {
f := func(funcName string, vExpected float64) {
t.Helper()
var me metricExpr
args := []interface{}{&rollupExpr{Expr: &me}}
var me metricsql.MetricExpr
args := []interface{}{&metricsql.RollupExpr{Expr: &me}}
testRollupFunc(t, funcName, args, &me, vExpected)
}
f("default_rollup", 123)
f("changes", 10)
f("delta", -89)
f("deriv", -712)
f("default_rollup", 34)
f("changes", 11)
f("delta", 34)
f("deriv", -266.85860231406065)
f("deriv_fast", -712)
f("idelta", 0)
f("increase", 275)
f("increase", 398)
f("irate", 0)
f("rate", 2200)
f("resets", 5)
f("range_over_time", 111)
f("avg_over_time", 47.083333333333336)
f("min_over_time", 12)
f("max_over_time", 123)
f("tmin_over_time", 0.08)
f("tmax_over_time", 0.005)
f("sum_over_time", 565)
f("sum2_over_time", 37951)
f("geomean_over_time", 39.33466603189148)
f("count_over_time", 12)
f("stddev_over_time", 30.752935722554287)
f("stdvar_over_time", 945.7430555555555)
f("first_over_time", 123)
f("last_over_time", 34)
f("integrate", 61.0275)
f("integrate", 5.4705)
f("distinct_over_time", 8)
f("ideriv", 0)
f("decreases_over_time", 5)
f("increases_over_time", 5)
}
func TestRollupNewRollupFuncError(t *testing.T) {
@@ -239,7 +420,7 @@ func TestRollupNewRollupFuncError(t *testing.T) {
Values: []float64{321},
Timestamps: []int64{123},
}}
me := &metricExpr{}
me := &metricsql.MetricExpr{}
f("holt_winters", []interface{}{123, 123, 321})
f("holt_winters", []interface{}{me, 123, 321})
f("holt_winters", []interface{}{me, scalarTs, 321})
@@ -259,7 +440,7 @@ func TestRollupNoWindowNoPoints(t *testing.T) {
}
rc.Timestamps = getTimestamps(rc.Start, rc.End, rc.Step)
values := rc.Do(nil, testValues, testTimestamps)
valuesExpected := []float64{nan, nan, nan, nan, 123}
valuesExpected := []float64{nan, nan, nan, nan, nan}
timestampsExpected := []int64{0, 1, 2, 3, 4}
testRowsEqual(t, values, rc.Timestamps, valuesExpected, timestampsExpected)
})
@@ -267,14 +448,14 @@ func TestRollupNoWindowNoPoints(t *testing.T) {
rc := rollupConfig{
Func: rollupDelta,
Start: 120,
End: 144,
End: 148,
Step: 4,
Window: 0,
}
rc.Timestamps = getTimestamps(rc.Start, rc.End, rc.Step)
values := rc.Do(nil, testValues, testTimestamps)
valuesExpected := []float64{2, 2, 0, 0, 0, nan, nan}
timestampsExpected := []int64{120, 124, 128, 132, 136, 140, 144}
valuesExpected := []float64{2, 0, 0, 0, nan, nan, nan, nan}
timestampsExpected := []int64{120, 124, 128, 132, 136, 140, 144, 148}
testRowsEqual(t, values, rc.Timestamps, valuesExpected, timestampsExpected)
})
}
@@ -290,22 +471,22 @@ func TestRollupWindowNoPoints(t *testing.T) {
}
rc.Timestamps = getTimestamps(rc.Start, rc.End, rc.Step)
values := rc.Do(nil, testValues, testTimestamps)
valuesExpected := []float64{nan, nan, nan, nan, 123}
valuesExpected := []float64{nan, nan, nan, nan, nan}
timestampsExpected := []int64{0, 1, 2, 3, 4}
testRowsEqual(t, values, rc.Timestamps, valuesExpected, timestampsExpected)
})
t.Run("afterEnd", func(t *testing.T) {
rc := rollupConfig{
Func: rollupFirst,
Start: 141,
End: 171,
Start: 161,
End: 191,
Step: 10,
Window: 3,
}
rc.Timestamps = getTimestamps(rc.Start, rc.End, rc.Step)
values := rc.Do(nil, testValues, testTimestamps)
valuesExpected := []float64{34, nan, nan, nan}
timestampsExpected := []int64{141, 151, 161, 171}
valuesExpected := []float64{nan, nan, nan, nan}
timestampsExpected := []int64{161, 171, 181, 191}
testRowsEqual(t, values, rc.Timestamps, valuesExpected, timestampsExpected)
})
}
@@ -315,14 +496,14 @@ func TestRollupNoWindowPartialPoints(t *testing.T) {
rc := rollupConfig{
Func: rollupFirst,
Start: 0,
End: 20,
End: 25,
Step: 5,
Window: 0,
}
rc.Timestamps = getTimestamps(rc.Start, rc.End, rc.Step)
values := rc.Do(nil, testValues, testTimestamps)
valuesExpected := []float64{123, 123, 123, 123, 123}
timestampsExpected := []int64{0, 5, 10, 15, 20}
valuesExpected := []float64{nan, 123, nan, 34, nan, 44}
timestampsExpected := []int64{0, 5, 10, 15, 20, 25}
testRowsEqual(t, values, rc.Timestamps, valuesExpected, timestampsExpected)
})
t.Run("afterEnd", func(t *testing.T) {
@@ -335,7 +516,7 @@ func TestRollupNoWindowPartialPoints(t *testing.T) {
}
rc.Timestamps = getTimestamps(rc.Start, rc.End, rc.Step)
values := rc.Do(nil, testValues, testTimestamps)
valuesExpected := []float64{44, 34, 34, nan}
valuesExpected := []float64{44, 32, 34, nan}
timestampsExpected := []int64{100, 120, 140, 160}
testRowsEqual(t, values, rc.Timestamps, valuesExpected, timestampsExpected)
})
@@ -349,7 +530,7 @@ func TestRollupNoWindowPartialPoints(t *testing.T) {
}
rc.Timestamps = getTimestamps(rc.Start, rc.End, rc.Step)
values := rc.Do(nil, testValues, testTimestamps)
valuesExpected := []float64{nan, 123, 54, 44, nan}
valuesExpected := []float64{nan, nan, 123, 34, 32}
timestampsExpected := []int64{-50, 0, 50, 100, 150}
testRowsEqual(t, values, rc.Timestamps, valuesExpected, timestampsExpected)
})
@@ -366,7 +547,7 @@ func TestRollupWindowPartialPoints(t *testing.T) {
}
rc.Timestamps = getTimestamps(rc.Start, rc.End, rc.Step)
values := rc.Do(nil, testValues, testTimestamps)
valuesExpected := []float64{123, 123, 34, 34, 44}
valuesExpected := []float64{nan, 123, 123, 34, 34}
timestampsExpected := []int64{0, 5, 10, 15, 20}
testRowsEqual(t, values, rc.Timestamps, valuesExpected, timestampsExpected)
})
@@ -380,7 +561,7 @@ func TestRollupWindowPartialPoints(t *testing.T) {
}
rc.Timestamps = getTimestamps(rc.Start, rc.End, rc.Step)
values := rc.Do(nil, testValues, testTimestamps)
valuesExpected := []float64{34, 34, nan, nan}
valuesExpected := []float64{44, 34, 34, nan}
timestampsExpected := []int64{100, 120, 140, 160}
testRowsEqual(t, values, rc.Timestamps, valuesExpected, timestampsExpected)
})
@@ -394,12 +575,57 @@ func TestRollupWindowPartialPoints(t *testing.T) {
}
rc.Timestamps = getTimestamps(rc.Start, rc.End, rc.Step)
values := rc.Do(nil, testValues, testTimestamps)
valuesExpected := []float64{54, 44, nan, nan}
valuesExpected := []float64{nan, 54, 44, nan}
timestampsExpected := []int64{0, 50, 100, 150}
testRowsEqual(t, values, rc.Timestamps, valuesExpected, timestampsExpected)
})
}
func TestRollupFuncsLookbackDelta(t *testing.T) {
t.Run("1", func(t *testing.T) {
rc := rollupConfig{
Func: rollupFirst,
Start: 80,
End: 140,
Step: 10,
LookbackDelta: 1,
}
rc.Timestamps = getTimestamps(rc.Start, rc.End, rc.Step)
values := rc.Do(nil, testValues, testTimestamps)
valuesExpected := []float64{99, nan, 44, nan, 32, 34, nan}
timestampsExpected := []int64{80, 90, 100, 110, 120, 130, 140}
testRowsEqual(t, values, rc.Timestamps, valuesExpected, timestampsExpected)
})
t.Run("7", func(t *testing.T) {
rc := rollupConfig{
Func: rollupFirst,
Start: 80,
End: 140,
Step: 10,
LookbackDelta: 7,
}
rc.Timestamps = getTimestamps(rc.Start, rc.End, rc.Step)
values := rc.Do(nil, testValues, testTimestamps)
valuesExpected := []float64{99, nan, 44, nan, 32, 34, nan}
timestampsExpected := []int64{80, 90, 100, 110, 120, 130, 140}
testRowsEqual(t, values, rc.Timestamps, valuesExpected, timestampsExpected)
})
t.Run("0", func(t *testing.T) {
rc := rollupConfig{
Func: rollupFirst,
Start: 80,
End: 140,
Step: 10,
LookbackDelta: 0,
}
rc.Timestamps = getTimestamps(rc.Start, rc.End, rc.Step)
values := rc.Do(nil, testValues, testTimestamps)
valuesExpected := []float64{99, nan, 44, nan, 32, 34, nan}
timestampsExpected := []int64{80, 90, 100, 110, 120, 130, 140}
testRowsEqual(t, values, rc.Timestamps, valuesExpected, timestampsExpected)
})
}
func TestRollupFuncsNoWindow(t *testing.T) {
t.Run("first", func(t *testing.T) {
rc := rollupConfig{
@@ -411,7 +637,7 @@ func TestRollupFuncsNoWindow(t *testing.T) {
}
rc.Timestamps = getTimestamps(rc.Start, rc.End, rc.Step)
values := rc.Do(nil, testValues, testTimestamps)
valuesExpected := []float64{123, 21, 12, 34, nan}
valuesExpected := []float64{nan, 123, 54, 44, 34}
timestampsExpected := []int64{0, 40, 80, 120, 160}
testRowsEqual(t, values, rc.Timestamps, valuesExpected, timestampsExpected)
})
@@ -425,7 +651,7 @@ func TestRollupFuncsNoWindow(t *testing.T) {
}
rc.Timestamps = getTimestamps(rc.Start, rc.End, rc.Step)
values := rc.Do(nil, testValues, testTimestamps)
valuesExpected := []float64{4, 4, 3, 1, nan}
valuesExpected := []float64{nan, 4, 4, 3, 1}
timestampsExpected := []int64{0, 40, 80, 120, 160}
testRowsEqual(t, values, rc.Timestamps, valuesExpected, timestampsExpected)
})
@@ -439,7 +665,7 @@ func TestRollupFuncsNoWindow(t *testing.T) {
}
rc.Timestamps = getTimestamps(rc.Start, rc.End, rc.Step)
values := rc.Do(nil, testValues, testTimestamps)
valuesExpected := []float64{21, 12, 32, 34, nan}
valuesExpected := []float64{nan, 21, 12, 32, 34}
timestampsExpected := []int64{0, 40, 80, 120, 160}
testRowsEqual(t, values, rc.Timestamps, valuesExpected, timestampsExpected)
})
@@ -453,7 +679,7 @@ func TestRollupFuncsNoWindow(t *testing.T) {
}
rc.Timestamps = getTimestamps(rc.Start, rc.End, rc.Step)
values := rc.Do(nil, testValues, testTimestamps)
valuesExpected := []float64{123, 99, 44, 34, nan}
valuesExpected := []float64{nan, 123, 99, 44, 34}
timestampsExpected := []int64{0, 40, 80, 120, 160}
testRowsEqual(t, values, rc.Timestamps, valuesExpected, timestampsExpected)
})
@@ -467,7 +693,7 @@ func TestRollupFuncsNoWindow(t *testing.T) {
}
rc.Timestamps = getTimestamps(rc.Start, rc.End, rc.Step)
values := rc.Do(nil, testValues, testTimestamps)
valuesExpected := []float64{222, 199, 110, 34, nan}
valuesExpected := []float64{nan, 222, 199, 110, 34}
timestampsExpected := []int64{0, 40, 80, 120, 160}
testRowsEqual(t, values, rc.Timestamps, valuesExpected, timestampsExpected)
})
@@ -481,7 +707,7 @@ func TestRollupFuncsNoWindow(t *testing.T) {
}
rc.Timestamps = getTimestamps(rc.Start, rc.End, rc.Step)
values := rc.Do(nil, testValues, testTimestamps)
valuesExpected := []float64{-102, -9, 22, 0, nan}
valuesExpected := []float64{nan, nan, -9, 22, 0}
timestampsExpected := []int64{0, 40, 80, 120, 160}
testRowsEqual(t, values, rc.Timestamps, valuesExpected, timestampsExpected)
})
@@ -495,10 +721,80 @@ func TestRollupFuncsNoWindow(t *testing.T) {
}
rc.Timestamps = getTimestamps(rc.Start, rc.End, rc.Step)
values := rc.Do(nil, testValues, testTimestamps)
valuesExpected := []float64{33, -87, 0, nan}
valuesExpected := []float64{123, 33, -87, 0}
timestampsExpected := []int64{10, 50, 90, 130}
testRowsEqual(t, values, rc.Timestamps, valuesExpected, timestampsExpected)
})
t.Run("lag", func(t *testing.T) {
rc := rollupConfig{
Func: rollupLag,
Start: 0,
End: 160,
Step: 40,
Window: 0,
}
rc.Timestamps = getTimestamps(rc.Start, rc.End, rc.Step)
values := rc.Do(nil, testValues, testTimestamps)
valuesExpected := []float64{nan, 0.004, 0, 0, 0.03}
timestampsExpected := []int64{0, 40, 80, 120, 160}
testRowsEqual(t, values, rc.Timestamps, valuesExpected, timestampsExpected)
})
t.Run("lifetime_1", func(t *testing.T) {
rc := rollupConfig{
Func: rollupLifetime,
Start: 0,
End: 160,
Step: 40,
Window: 0,
}
rc.Timestamps = getTimestamps(rc.Start, rc.End, rc.Step)
values := rc.Do(nil, testValues, testTimestamps)
valuesExpected := []float64{nan, 0.031, 0.044, 0.04, 0.01}
timestampsExpected := []int64{0, 40, 80, 120, 160}
testRowsEqual(t, values, rc.Timestamps, valuesExpected, timestampsExpected)
})
t.Run("lifetime_2", func(t *testing.T) {
rc := rollupConfig{
Func: rollupLifetime,
Start: 0,
End: 160,
Step: 40,
Window: 200,
}
rc.Timestamps = getTimestamps(rc.Start, rc.End, rc.Step)
values := rc.Do(nil, testValues, testTimestamps)
valuesExpected := []float64{nan, 0.031, 0.075, 0.115, 0.125}
timestampsExpected := []int64{0, 40, 80, 120, 160}
testRowsEqual(t, values, rc.Timestamps, valuesExpected, timestampsExpected)
})
t.Run("scrape_interval_1", func(t *testing.T) {
rc := rollupConfig{
Func: rollupScrapeInterval,
Start: 0,
End: 160,
Step: 40,
Window: 0,
}
rc.Timestamps = getTimestamps(rc.Start, rc.End, rc.Step)
values := rc.Do(nil, testValues, testTimestamps)
valuesExpected := []float64{nan, 0.010333333333333333, 0.011, 0.013333333333333334, 0.01}
timestampsExpected := []int64{0, 40, 80, 120, 160}
testRowsEqual(t, values, rc.Timestamps, valuesExpected, timestampsExpected)
})
t.Run("scrape_interval_2", func(t *testing.T) {
rc := rollupConfig{
Func: rollupScrapeInterval,
Start: 0,
End: 160,
Step: 40,
Window: 80,
}
rc.Timestamps = getTimestamps(rc.Start, rc.End, rc.Step)
values := rc.Do(nil, testValues, testTimestamps)
valuesExpected := []float64{nan, 0.010333333333333333, 0.010714285714285714, 0.012, 0.0125}
timestampsExpected := []int64{0, 40, 80, 120, 160}
testRowsEqual(t, values, rc.Timestamps, valuesExpected, timestampsExpected)
})
t.Run("changes", func(t *testing.T) {
rc := rollupConfig{
Func: rollupChanges,
@@ -509,10 +805,24 @@ func TestRollupFuncsNoWindow(t *testing.T) {
}
rc.Timestamps = getTimestamps(rc.Start, rc.End, rc.Step)
values := rc.Do(nil, testValues, testTimestamps)
valuesExpected := []float64{3, 4, 3, 0, nan}
valuesExpected := []float64{nan, 4, 4, 3, 0}
timestampsExpected := []int64{0, 40, 80, 120, 160}
testRowsEqual(t, values, rc.Timestamps, valuesExpected, timestampsExpected)
})
t.Run("changes_small_window", func(t *testing.T) {
rc := rollupConfig{
Func: rollupChanges,
Start: 0,
End: 45,
Step: 9,
Window: 9,
}
rc.Timestamps = getTimestamps(rc.Start, rc.End, rc.Step)
values := rc.Do(nil, testValues, testTimestamps)
valuesExpected := []float64{nan, 1, 1, 1, 1, 0}
timestampsExpected := []int64{0, 9, 18, 27, 36, 45}
testRowsEqual(t, values, rc.Timestamps, valuesExpected, timestampsExpected)
})
t.Run("resets", func(t *testing.T) {
rc := rollupConfig{
Func: rollupResets,
@@ -523,7 +833,7 @@ func TestRollupFuncsNoWindow(t *testing.T) {
}
rc.Timestamps = getTimestamps(rc.Start, rc.End, rc.Step)
values := rc.Do(nil, testValues, testTimestamps)
valuesExpected := []float64{2, 2, 1, 0, nan}
valuesExpected := []float64{nan, 2, 2, 1, 0}
timestampsExpected := []int64{0, 40, 80, 120, 160}
testRowsEqual(t, values, rc.Timestamps, valuesExpected, timestampsExpected)
})
@@ -537,13 +847,13 @@ func TestRollupFuncsNoWindow(t *testing.T) {
}
rc.Timestamps = getTimestamps(rc.Start, rc.End, rc.Step)
values := rc.Do(nil, testValues, testTimestamps)
valuesExpected := []float64{55.5, 49.75, 36.666666666666664, 34, nan}
valuesExpected := []float64{nan, 55.5, 49.75, 36.666666666666664, 34}
timestampsExpected := []int64{0, 40, 80, 120, 160}
testRowsEqual(t, values, rc.Timestamps, valuesExpected, timestampsExpected)
})
t.Run("deriv", func(t *testing.T) {
rc := rollupConfig{
Func: rollupDeriv,
Func: rollupDerivSlow,
Start: 0,
End: 160,
Step: 40,
@@ -551,10 +861,24 @@ func TestRollupFuncsNoWindow(t *testing.T) {
}
rc.Timestamps = getTimestamps(rc.Start, rc.End, rc.Step)
values := rc.Do(nil, testValues, testTimestamps)
valuesExpected := []float64{-3290.3225806451615, -204.54545454545456, 550, 0, nan}
valuesExpected := []float64{0, -2879.310344827587, 558.0608793686592, 422.84569138276544, 0}
timestampsExpected := []int64{0, 40, 80, 120, 160}
testRowsEqual(t, values, rc.Timestamps, valuesExpected, timestampsExpected)
})
t.Run("deriv_fast", func(t *testing.T) {
rc := rollupConfig{
Func: rollupDerivFast,
Start: 0,
End: 20,
Step: 4,
Window: 0,
}
rc.Timestamps = getTimestamps(rc.Start, rc.End, rc.Step)
values := rc.Do(nil, testValues, testTimestamps)
valuesExpected := []float64{nan, nan, nan, 0, -8900, 0}
timestampsExpected := []int64{0, 4, 8, 12, 16, 20}
testRowsEqual(t, values, rc.Timestamps, valuesExpected, timestampsExpected)
})
t.Run("ideriv", func(t *testing.T) {
rc := rollupConfig{
Func: rollupIderiv,
@@ -565,7 +889,7 @@ func TestRollupFuncsNoWindow(t *testing.T) {
}
rc.Timestamps = getTimestamps(rc.Start, rc.End, rc.Step)
values := rc.Do(nil, testValues, testTimestamps)
valuesExpected := []float64{-1916.6666666666665, -43500, 400, 0, nan}
valuesExpected := []float64{nan, -1916.6666666666665, -43500, 400, 0}
timestampsExpected := []int64{0, 40, 80, 120, 160}
testRowsEqual(t, values, rc.Timestamps, valuesExpected, timestampsExpected)
})
@@ -579,7 +903,7 @@ func TestRollupFuncsNoWindow(t *testing.T) {
}
rc.Timestamps = getTimestamps(rc.Start, rc.End, rc.Step)
values := rc.Do(nil, testValues, testTimestamps)
valuesExpected := []float64{39.81519810323691, 32.080952292598795, 5.2493385826745405, 0, nan}
valuesExpected := []float64{nan, 39.81519810323691, 32.080952292598795, 5.2493385826745405, 5.830951894845301}
timestampsExpected := []int64{0, 40, 80, 120, 160}
testRowsEqual(t, values, rc.Timestamps, valuesExpected, timestampsExpected)
})
@@ -593,11 +917,11 @@ func TestRollupFuncsNoWindow(t *testing.T) {
}
rc.Timestamps = getTimestamps(rc.Start, rc.End, rc.Step)
values := rc.Do(nil, testValues, testTimestamps)
valuesExpected := []float64{4.6035, 4.3934999999999995, 2.166, 0.34, nan}
valuesExpected := []float64{nan, 1.526, 2.2795, 1.325, 0.34}
timestampsExpected := []int64{0, 40, 80, 120, 160}
testRowsEqual(t, values, rc.Timestamps, valuesExpected, timestampsExpected)
})
t.Run("distinct", func(t *testing.T) {
t.Run("distinct_over_time_1", func(t *testing.T) {
rc := rollupConfig{
Func: rollupDistinct,
Start: 0,
@@ -607,10 +931,45 @@ func TestRollupFuncsNoWindow(t *testing.T) {
}
rc.Timestamps = getTimestamps(rc.Start, rc.End, rc.Step)
values := rc.Do(nil, testValues, testTimestamps)
valuesExpected := []float64{4, 4, 3, 1, nan}
valuesExpected := []float64{nan, 4, 4, 3, 1}
timestampsExpected := []int64{0, 40, 80, 120, 160}
testRowsEqual(t, values, rc.Timestamps, valuesExpected, timestampsExpected)
})
t.Run("distinct_over_time_2", func(t *testing.T) {
rc := rollupConfig{
Func: rollupDistinct,
Start: 0,
End: 160,
Step: 40,
Window: 80,
}
rc.Timestamps = getTimestamps(rc.Start, rc.End, rc.Step)
values := rc.Do(nil, testValues, testTimestamps)
valuesExpected := []float64{nan, 4, 7, 6, 3}
timestampsExpected := []int64{0, 40, 80, 120, 160}
testRowsEqual(t, values, rc.Timestamps, valuesExpected, timestampsExpected)
})
}
func TestRollupBigNumberOfValues(t *testing.T) {
const srcValuesCount = 1e4
rc := rollupConfig{
Func: rollupDefault,
End: srcValuesCount,
Step: srcValuesCount / 5,
Window: srcValuesCount / 4,
}
rc.Timestamps = getTimestamps(rc.Start, rc.End, rc.Step)
srcValues := make([]float64, srcValuesCount)
srcTimestamps := make([]int64, srcValuesCount)
for i := 0; i < srcValuesCount; i++ {
srcValues[i] = float64(i)
srcTimestamps[i] = int64(i / 2)
}
values := rc.Do(nil, srcValues, srcTimestamps)
valuesExpected := []float64{1, 4001, 8001, 9999, nan, nan}
timestampsExpected := []int64{0, 2000, 4000, 6000, 8000, 10000}
testRowsEqual(t, values, rc.Timestamps, valuesExpected, timestampsExpected)
}
func testRowsEqual(t *testing.T, values []float64, timestamps []int64, valuesExpected []float64, timestampsExpected []int64) {
@@ -641,7 +1000,7 @@ func testRowsEqual(t *testing.T, values []float64, timestamps []int64, valuesExp
}
continue
}
if v != vExpected {
if math.Abs(v-vExpected) > 1e-15 {
t.Fatalf("unexpected value at values[%d]; got %f; want %f\nvalues=\n%v\nvaluesExpected=\n%v",
i, v, vExpected, values, valuesExpected)
}
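A note on the hunk above: exact float64 equality is replaced with a small tolerance, matching the eps check added to testRollupFunc earlier in this file, which avoids spurious failures from floating-point rounding. Below is a minimal sketch of such a tolerance-based comparison, mirroring the 1e-15 threshold and the NaN-only-matches-NaN behavior of testRowsEqual; it is illustrative and not part of the diff.

// Illustrative helper, not part of the repository.
package main

import (
	"fmt"
	"math"
)

func almostEqual(a, b, eps float64) bool {
	if math.IsNaN(a) || math.IsNaN(b) {
		// NaN only matches NaN, as in the test helpers above.
		return math.IsNaN(a) && math.IsNaN(b)
	}
	return math.Abs(a-b) <= eps
}

func main() {
	fmt.Println(almostEqual(0.1+0.2, 0.3, 1e-15)) // true: rounding noise stays below eps
	fmt.Println(almostEqual(1.0, 1.1, 1e-15))     // false: a real difference
}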

View File

@@ -4,6 +4,7 @@ import (
"fmt"
"sort"
"strconv"
"sync"
"unsafe"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
@@ -38,11 +39,13 @@ func (ts *timeseries) String() string {
return fmt.Sprintf("MetricName=%s, Values=%g, Timestamps=%d", &ts.MetricName, ts.Values, ts.Timestamps)
}
func (ts *timeseries) CopyFrom(src *timeseries) {
func (ts *timeseries) CopyFromShallowTimestamps(src *timeseries) {
ts.Reset()
ts.MetricName.CopyFrom(&src.MetricName)
ts.Values = append(ts.Values[:0], src.Values...)
ts.Timestamps = append(ts.Timestamps[:0], src.Timestamps...)
ts.Timestamps = src.Timestamps
ts.denyReuse = true
}
func (ts *timeseries) CopyFromMetricNames(src *timeseries) {
@@ -59,7 +62,21 @@ func (ts *timeseries) CopyShallow(src *timeseries) {
ts.denyReuse = true
}
func marshalTimeseriesFast(tss []*timeseries, maxSize int, step int64) []byte {
func getTimeseries() *timeseries {
if v := timeseriesPool.Get(); v != nil {
return v.(*timeseries)
}
return &timeseries{}
}
func putTimeseries(ts *timeseries) {
ts.Reset()
timeseriesPool.Put(ts)
}
var timeseriesPool sync.Pool
func marshalTimeseriesFast(dst []byte, tss []*timeseries, maxSize int, step int64) []byte {
if len(tss) == 0 {
logger.Panicf("BUG: tss cannot be empty")
}
@@ -75,13 +92,13 @@ func marshalTimeseriesFast(tss []*timeseries, maxSize int, step int64) []byte {
if size > maxSize {
// Do not marshal tss, since it would occupy too much space
return nil
return dst
}
// Allocate the buffer for the marshaled tss before marshaling them.
// This should reduce memory fragmentation and memory usage.
dst := make([]byte, 0, size)
dst = marshalFastTimestamps(dst, tss[0].Timestamps)
dst = bytesutil.Resize(dst, size)
dst = marshalFastTimestamps(dst[:0], tss[0].Timestamps)
for _, ts := range tss {
dst = ts.marshalFastNoTimestamps(dst)
}

View File

@@ -74,7 +74,7 @@ func TestTimeseriesMarshalUnmarshalFast(t *testing.T) {
tssOrig = append(tssOrig, &ts)
}
buf := marshalTimeseriesFast(tssOrig, 1e6, 123)
buf := marshalTimeseriesFast(nil, tssOrig, 1e6, 123)
tssGot, err := unmarshalTimeseriesFast(buf)
if err != nil {
t.Fatalf("error in unmarshalTimeseriesFast: %s", err)

View File

@@ -4,6 +4,7 @@ import (
"fmt"
"math"
"math/rand"
"regexp"
"sort"
"strconv"
"strings"
@@ -11,6 +12,7 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/decimal"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/metricsql"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/storage"
"github.com/valyala/histogram"
)
@@ -61,9 +63,14 @@ var transformFuncs = map[string]transformFunc{
"label_keep": transformLabelKeep,
"label_copy": transformLabelCopy,
"label_move": transformLabelMove,
"label_transform": transformLabelTransform,
"label_value": transformLabelValue,
"label_match": transformLabelMatch,
"label_mismatch": transformLabelMismatch,
"union": transformUnion,
"": transformUnion, // empty func is a synonim to union
"keep_last_value": transformKeepLastValue,
"keep_next_value": transformKeepNextValue,
"start": newTransformFuncZeroArgs(transformStart),
"end": newTransformFuncZeroArgs(transformEnd),
"step": newTransformFuncZeroArgs(transformStep),
@@ -88,6 +95,10 @@ var transformFuncs = map[string]transformFunc{
"cos": newTransformFuncOneArg(transformCos),
"asin": newTransformFuncOneArg(transformAsin),
"acos": newTransformFuncOneArg(transformAcos),
"prometheus_buckets": transformPrometheusBuckets,
"histogram_share": transformHistogramShare,
"sort_by_label": newTransformFuncSortByLabel(false),
"sort_by_label_desc": newTransformFuncSortByLabel(true),
}
func getTransformFunc(s string) transformFunc {
@@ -95,13 +106,9 @@ func getTransformFunc(s string) transformFunc {
return transformFuncs[s]
}
func isTransformFunc(s string) bool {
return getTransformFunc(s) != nil
}
type transformFuncArg struct {
ec *EvalConfig
fe *funcExpr
fe *metricsql.FuncExpr
args [][]*timeseries
}
@@ -122,8 +129,9 @@ func newTransformFuncOneArg(tf func(v float64) float64) transformFunc {
}
}
func doTransformValues(arg []*timeseries, tf func(values []float64), fe *funcExpr) ([]*timeseries, error) {
keepMetricGroup := transformFuncsKeepMetricGroup[fe.Name]
func doTransformValues(arg []*timeseries, tf func(values []float64), fe *metricsql.FuncExpr) ([]*timeseries, error) {
name := strings.ToLower(fe.Name)
keepMetricGroup := transformFuncsKeepMetricGroup[name]
for _, ts := range arg {
if !keepMetricGroup {
ts.MetricName.ResetMetricGroup()
@@ -144,28 +152,10 @@ func transformAbsent(tfa *transformFuncArg) ([]*timeseries, error) {
return nil, err
}
arg := args[0]
if len(arg) == 0 {
// Copy tags from arg
rvs := evalNumber(tfa.ec, 1)
rv := rvs[0]
me, ok := tfa.fe.Args[0].(*metricExpr)
if !ok {
return rvs, nil
}
for i := range me.TagFilters {
tf := &me.TagFilters[i]
if len(tf.Key) == 0 {
continue
}
if tf.IsRegexp || tf.IsNegative {
continue
}
rv.MetricName.AddTagBytes(tf.Key, tf.Value)
}
rvs := getAbsentTimeseries(tfa.ec, tfa.fe.Args[0])
return rvs, nil
}
for _, ts := range arg {
ts.MetricName.ResetMetricGroup()
for i, v := range ts.Values {
@@ -180,6 +170,28 @@ func transformAbsent(tfa *transformFuncArg) ([]*timeseries, error) {
return arg, nil
}
func getAbsentTimeseries(ec *EvalConfig, arg metricsql.Expr) []*timeseries {
// Copy tags from arg
rvs := evalNumber(ec, 1)
rv := rvs[0]
me, ok := arg.(*metricsql.MetricExpr)
if !ok {
return rvs
}
tfs := toTagFilters(me.LabelFilters)
for i := range tfs {
tf := &tfs[i]
if len(tf.Key) == 0 {
continue
}
if tf.IsRegexp || tf.IsNegative {
continue
}
rv.MetricName.AddTagBytes(tf.Key, tf.Value)
}
return rvs
}
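getAbsentTimeseries copies only plain (non-regexp, non-negative) label filters from the argument onto the generated series, mirroring Prometheus' absent() behavior: absent(up{job="api", instance=~".*"}) carries over job="api" but not the regexp filter. A small, self-contained sketch of that selection rule (the filter struct here is hypothetical, not the storage.TagFilter type):

package main

import "fmt"

type labelFilter struct {
	key, value           string
	isRegexp, isNegative bool
}

func copyablePlainFilters(lfs []labelFilter) map[string]string {
	m := make(map[string]string)
	for _, lf := range lfs {
		if lf.key == "" || lf.isRegexp || lf.isNegative {
			continue // only exact, positive matchers define the absent() result labels
		}
		m[lf.key] = lf.value
	}
	return m
}

func main() {
	lfs := []labelFilter{
		{key: "job", value: "api"},
		{key: "instance", value: ".*", isRegexp: true},
		{key: "env", value: "dev", isNegative: true},
	}
	fmt.Println(copyablePlainFilters(lfs)) // map[job:api]
}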
func transformCeil(v float64) float64 {
return math.Ceil(v)
}
@@ -268,24 +280,364 @@ func transformFloor(v float64) float64 {
return math.Floor(v)
}
func transformHistogramQuantile(tfa *transformFuncArg) ([]*timeseries, error) {
func transformPrometheusBuckets(tfa *transformFuncArg) ([]*timeseries, error) {
args := tfa.args
if err := expectTransformArgsNum(args, 2); err != nil {
return nil, err
}
phis, err := getScalar(args[0], 0)
if err != nil {
if err := expectTransformArgsNum(args, 1); err != nil {
return nil, err
}
rvs := vmrangeBucketsToLE(args[0])
return rvs, nil
}
// Group metrics by all tags excluding "le"
func vmrangeBucketsToLE(tss []*timeseries) []*timeseries {
rvs := make([]*timeseries, 0, len(tss))
// Group timeseries by MetricGroup+tags excluding `vmrange` tag.
type x struct {
le float64
ts *timeseries
startStr string
endStr string
start float64
end float64
ts *timeseries
}
m := make(map[string][]x)
bb := bbPool.Get()
for _, ts := range args[1] {
defer bbPool.Put(bb)
for _, ts := range tss {
vmrange := ts.MetricName.GetTagValue("vmrange")
if len(vmrange) == 0 {
if le := ts.MetricName.GetTagValue("le"); len(le) > 0 {
// Keep Prometheus-compatible buckets.
rvs = append(rvs, ts)
}
continue
}
n := strings.Index(bytesutil.ToUnsafeString(vmrange), "...")
if n < 0 {
continue
}
startStr := string(vmrange[:n])
start, err := strconv.ParseFloat(startStr, 64)
if err != nil {
continue
}
endStr := string(vmrange[n+len("..."):])
end, err := strconv.ParseFloat(endStr, 64)
if err != nil {
continue
}
ts.MetricName.RemoveTag("le")
ts.MetricName.RemoveTag("vmrange")
bb.B = marshalMetricNameSorted(bb.B[:0], &ts.MetricName)
m[string(bb.B)] = append(m[string(bb.B)], x{
startStr: startStr,
endStr: endStr,
start: start,
end: end,
ts: ts,
})
}
// Convert `vmrange` label in each group of time series to `le` label.
copyTS := func(src *timeseries, leStr string) *timeseries {
var ts timeseries
ts.CopyFromShallowTimestamps(src)
values := ts.Values
for i := range values {
values[i] = 0
}
ts.MetricName.RemoveTag("le")
ts.MetricName.AddTag("le", leStr)
return &ts
}
isZeroTS := func(ts *timeseries) bool {
for _, v := range ts.Values {
if v > 0 {
return false
}
}
return true
}
for _, xss := range m {
sort.Slice(xss, func(i, j int) bool { return xss[i].end < xss[j].end })
xssNew := make([]x, 0, len(xss)+2)
var xsPrev x
for _, xs := range xss {
ts := xs.ts
if isZeroTS(ts) {
// Skip time series with zeros. They are substituted by xssNew below.
xsPrev = xs
continue
}
if xs.start != xsPrev.end {
xssNew = append(xssNew, x{
endStr: xs.startStr,
end: xs.start,
ts: copyTS(ts, xs.startStr),
})
}
ts.MetricName.AddTag("le", xs.endStr)
xssNew = append(xssNew, xs)
xsPrev = xs
}
if !math.IsInf(xsPrev.end, 1) {
xssNew = append(xssNew, x{
endStr: "+Inf",
end: math.Inf(1),
ts: copyTS(xsPrev.ts, "+Inf"),
})
}
xss = xssNew
for i := range xss[0].ts.Values {
count := float64(0)
for _, xs := range xss {
ts := xs.ts
v := ts.Values[i]
if !math.IsNaN(v) && v > 0 {
count += v
}
ts.Values[i] = count
}
}
for _, xs := range xss {
rvs = append(rvs, xs.ts)
}
}
return rvs
}
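prometheus_buckets() rewrites VictoriaMetrics' per-range vmrange="start...end" buckets into cumulative Prometheus-style le buckets, as implemented by vmrangeBucketsToLE above. A minimal, self-contained sketch of the core idea for a single timestamp (hypothetical counts, without the gap-filling done above; the trailing +Inf bucket carries the total):

package main

import "fmt"

type vmrangeBucket struct {
	start, end float64 // parsed from the vmrange="start...end" label
	count      float64 // per-range count for one timestamp
}

func main() {
	// Hypothetical per-range counts, already sorted by upper bound.
	buckets := []vmrangeBucket{
		{1, 10, 5},
		{10, 100, 3},
		{100, 1000, 2},
	}
	// Prometheus `le` buckets are cumulative: each bucket also counts everything below it.
	cumulative := 0.0
	for _, b := range buckets {
		cumulative += b.count
		fmt.Printf("le=%g -> %g\n", b.end, cumulative) // le=10 -> 5, le=100 -> 8, le=1000 -> 10
	}
	fmt.Printf("le=+Inf -> %g\n", cumulative)
}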
func transformHistogramShare(tfa *transformFuncArg) ([]*timeseries, error) {
args := tfa.args
if len(args) < 2 || len(args) > 3 {
return nil, fmt.Errorf("unexpected number of args; got %d; want 2...3", len(args))
}
les, err := getScalar(args[0], 0)
if err != nil {
return nil, fmt.Errorf("cannot parse le: %s", err)
}
// Convert buckets with `vmrange` labels to buckets with `le` labels.
tss := vmrangeBucketsToLE(args[1])
// Parse boundsLabel. See https://github.com/prometheus/prometheus/issues/5706 for details.
var boundsLabel string
if len(args) > 2 {
s, err := getString(args[2], 2)
if err != nil {
return nil, fmt.Errorf("cannot parse boundsLabel (arg #3): %s", err)
}
boundsLabel = s
}
// Group metrics by all tags excluding "le"
m := groupLeTimeseries(tss)
// Calculate share for les
share := func(i int, les []float64, xss []leTimeseries) (q, lower, upper float64) {
leReq := les[i]
if math.IsNaN(leReq) || len(xss) == 0 {
return nan, nan, nan
}
fixBrokenBuckets(i, xss)
if leReq < 0 {
return 0, 0, 0
}
if math.IsInf(leReq, 1) {
return 1, 1, 1
}
var vPrev, lePrev float64
for _, xs := range xss {
v := xs.ts.Values[i]
le := xs.le
if leReq >= le {
vPrev = v
lePrev = le
continue
}
// precondition: lePrev <= leReq < le
vLast := xss[len(xss)-1].ts.Values[i]
lower = vPrev / vLast
if math.IsInf(le, 1) {
return lower, lower, 1
}
if lePrev == leReq {
return lower, lower, lower
}
upper = v / vLast
q = lower + (v-vPrev)/vLast*(leReq-lePrev)/(le-lePrev)
return q, lower, upper
}
// precondition: leReq > leLast
return 1, 1, 1
}
rvs := make([]*timeseries, 0, len(m))
for _, xss := range m {
sort.Slice(xss, func(i, j int) bool {
return xss[i].le < xss[j].le
})
dst := xss[0].ts
var tsLower, tsUpper *timeseries
if len(boundsLabel) > 0 {
tsLower = &timeseries{}
tsLower.CopyFromShallowTimestamps(dst)
tsLower.MetricName.RemoveTag(boundsLabel)
tsLower.MetricName.AddTag(boundsLabel, "lower")
tsUpper = &timeseries{}
tsUpper.CopyFromShallowTimestamps(dst)
tsUpper.MetricName.RemoveTag(boundsLabel)
tsUpper.MetricName.AddTag(boundsLabel, "upper")
}
for i := range dst.Values {
q, lower, upper := share(i, les, xss)
dst.Values[i] = q
if len(boundsLabel) > 0 {
tsLower.Values[i] = lower
tsUpper.Values[i] = upper
}
}
rvs = append(rvs, dst)
if len(boundsLabel) > 0 {
rvs = append(rvs, tsLower)
rvs = append(rvs, tsUpper)
}
}
return rvs, nil
}
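The share helper above linearly interpolates inside the bucket that contains the requested le and also reports lower/upper bounds. A tiny, self-contained sketch with hypothetical cumulative buckets shows the arithmetic:

package main

import "fmt"

func main() {
	// Hypothetical cumulative buckets for one timestamp: le=1 -> 10, le=2 -> 30, +Inf -> 40.
	les := []float64{1, 2}
	counts := []float64{10, 30}
	vLast := 40.0 // count in the +Inf bucket (total)
	leReq := 1.5  // share(leReq) is requested

	vPrev, lePrev := 0.0, 0.0
	for i, le := range les {
		v := counts[i]
		if leReq >= le {
			vPrev, lePrev = v, le
			continue
		}
		lower := vPrev / vLast
		upper := v / vLast
		q := lower + (v-vPrev)/vLast*(leReq-lePrev)/(le-lePrev)
		fmt.Println(q, lower, upper) // 0.5 0.25 0.75
		return
	}
}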
func transformHistogramQuantile(tfa *transformFuncArg) ([]*timeseries, error) {
args := tfa.args
if len(args) < 2 || len(args) > 3 {
return nil, fmt.Errorf("unexpected number of args; got %d; want 2...3", len(args))
}
phis, err := getScalar(args[0], 0)
if err != nil {
return nil, fmt.Errorf("cannot parse phi: %s", err)
}
// Convert buckets with `vmrange` labels to buckets with `le` labels.
tss := vmrangeBucketsToLE(args[1])
// Parse boundsLabel. See https://github.com/prometheus/prometheus/issues/5706 for details.
var boundsLabel string
if len(args) > 2 {
s, err := getString(args[2], 2)
if err != nil {
return nil, fmt.Errorf("cannot parse boundsLabel (arg #3): %s", err)
}
boundsLabel = s
}
// Group metrics by all tags excluding "le"
m := groupLeTimeseries(tss)
// Calculate quantile for each group in m
lastNonInf := func(i int, xss []leTimeseries) float64 {
for len(xss) > 0 {
xsLast := xss[len(xss)-1]
v := xsLast.ts.Values[i]
if v == 0 {
return nan
}
if !math.IsInf(xsLast.le, 0) {
return xsLast.le
}
xss = xss[:len(xss)-1]
}
return nan
}
quantile := func(i int, phis []float64, xss []leTimeseries) (q, lower, upper float64) {
phi := phis[i]
if math.IsNaN(phi) {
return nan, nan, nan
}
fixBrokenBuckets(i, xss)
vLast := float64(0)
if len(xss) > 0 {
vLast = xss[len(xss)-1].ts.Values[i]
}
if vLast == 0 {
return nan, nan, nan
}
if phi < 0 {
return -inf, -inf, xss[0].ts.Values[i]
}
if phi > 1 {
return inf, vLast, inf
}
vReq := vLast * phi
vPrev := float64(0)
lePrev := float64(0)
for _, xs := range xss {
v := xs.ts.Values[i]
le := xs.le
if v <= 0 {
// Skip zero buckets.
lePrev = le
continue
}
if v < vReq {
vPrev = v
lePrev = le
continue
}
if math.IsInf(le, 0) {
vv := lastNonInf(i, xss)
return vv, vv, inf
}
if v == vPrev {
return lePrev, lePrev, v
}
vv := lePrev + (le-lePrev)*(vReq-vPrev)/(v-vPrev)
return vv, lePrev, le
}
vv := lastNonInf(i, xss)
return vv, vv, inf
}
rvs := make([]*timeseries, 0, len(m))
for _, xss := range m {
sort.Slice(xss, func(i, j int) bool {
return xss[i].le < xss[j].le
})
dst := xss[0].ts
var tsLower, tsUpper *timeseries
if len(boundsLabel) > 0 {
tsLower = &timeseries{}
tsLower.CopyFromShallowTimestamps(dst)
tsLower.MetricName.RemoveTag(boundsLabel)
tsLower.MetricName.AddTag(boundsLabel, "lower")
tsUpper = &timeseries{}
tsUpper.CopyFromShallowTimestamps(dst)
tsUpper.MetricName.RemoveTag(boundsLabel)
tsUpper.MetricName.AddTag(boundsLabel, "upper")
}
for i := range dst.Values {
v, lower, upper := quantile(i, phis, xss)
dst.Values[i] = v
if len(boundsLabel) > 0 {
tsLower.Values[i] = lower
tsUpper.Values[i] = upper
}
}
rvs = append(rvs, dst)
if len(boundsLabel) > 0 {
rvs = append(rvs, tsLower)
rvs = append(rvs, tsUpper)
}
}
return rvs, nil
}
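histogram_quantile interpolates in the other direction: given phi it finds the bucket whose cumulative count first reaches phi*total and interpolates the le boundary, returning the bucket bounds as lower/upper estimates. A hedged worked example with the same hypothetical buckets:

package main

import "fmt"

func main() {
	// Hypothetical cumulative buckets: le=1 -> 10, le=2 -> 30, +Inf -> 40.
	les := []float64{1, 2}
	counts := []float64{10, 30}
	vLast := 40.0
	phi := 0.5

	vReq := vLast * phi // the cumulative count where the requested quantile sits
	vPrev, lePrev := 0.0, 0.0
	for i, le := range les {
		v := counts[i]
		if v < vReq {
			vPrev, lePrev = v, le
			continue
		}
		q := lePrev + (le-lePrev)*(vReq-vPrev)/(v-vPrev)
		fmt.Println(q, lePrev, le) // 1.5 1 2 -> estimate plus lower/upper bucket bounds
		return
	}
}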
type leTimeseries struct {
le float64
ts *timeseries
}
func groupLeTimeseries(tss []*timeseries) map[string][]leTimeseries {
m := make(map[string][]leTimeseries)
bb := bbPool.Get()
for _, ts := range tss {
tagValue := ts.MetricName.GetTagValue("le")
if len(tagValue) == 0 {
continue
@@ -294,77 +646,31 @@ func transformHistogramQuantile(tfa *transformFuncArg) ([]*timeseries, error) {
if err != nil {
continue
}
var dst timeseries
dst.CopyFrom(ts)
dst.MetricName.ResetMetricGroup()
dst.MetricName.RemoveTag("le")
bb.B = marshalMetricTagsSorted(bb.B[:0], &dst.MetricName)
m[string(bb.B)] = append(m[string(bb.B)], x{
ts.MetricName.ResetMetricGroup()
ts.MetricName.RemoveTag("le")
bb.B = marshalMetricTagsSorted(bb.B[:0], &ts.MetricName)
m[string(bb.B)] = append(m[string(bb.B)], leTimeseries{
le: le,
ts: &dst,
ts: ts,
})
}
bbPool.Put(bb)
return m
}
// Calculate quantile for each group in m
lastNonInf := func(xss []x) float64 {
for len(xss) > 0 && math.IsInf(xss[len(xss)-1].le, 0) {
xss = xss[:len(xss)-1]
func fixBrokenBuckets(i int, xss []leTimeseries) {
// Fix broken buckets.
// They are already sorted by le, so their values must be in ascending order,
// since the next bucket includes all the previous buckets.
vPrev := float64(0)
for _, xs := range xss {
v := xs.ts.Values[i]
if v < vPrev || math.IsNaN(v) {
xs.ts.Values[i] = vPrev
} else {
vPrev = v
}
if len(xss) == 0 {
return nan
}
return xss[len(xss)-1].le
}
quantile := func(i int, phis []float64, xss []x) float64 {
vPrev := float64(0)
lePrev := float64(0)
phi := phis[i]
if math.IsNaN(phi) {
return nan
}
if phi < 0 {
return -inf
}
if phi > 1 {
return inf
}
vReq := xss[len(xss)-1].ts.Values[i] * phi
for _, xs := range xss {
v := xs.ts.Values[i]
le := xs.le
if v <= vPrev {
v = vPrev
le = lePrev
}
if v < vReq {
vPrev = v
lePrev = le
continue
}
if math.IsInf(le, 0) {
return lastNonInf(xss)
}
if v == vPrev {
return lePrev
}
return lePrev + (le-lePrev)*(vReq-vPrev)/(v-vPrev)
}
return lastNonInf(xss)
}
var rvs []*timeseries
for _, xss := range m {
sort.Slice(xss, func(i, j int) bool {
return xss[i].le < xss[j].le
})
dst := xss[0].ts
for i := range dst.Values {
dst.Values[i] = quantile(i, phis, xss)
}
rvs = append(rvs, dst)
}
return rvs, nil
}
func transformHour(t time.Time) int {
@@ -394,13 +700,6 @@ func runningAvg(a, b float64, idx int) float64 {
return a + (b-a)/float64(idx+1)
}
func keepLastValue(a, b float64, idx int) float64 {
if math.IsNaN(b) {
return a
}
return b
}
func skipLeadingNaNs(values []float64) []float64 {
i := 0
for i < len(values) && math.IsNaN(values[i]) {
@@ -428,13 +727,37 @@ func transformKeepLastValue(tfa *transformFuncArg) ([]*timeseries, error) {
if len(values) == 0 {
continue
}
prevValue := values[0]
lastValue := values[0]
for i, v := range values {
if math.IsNaN(v) {
v = prevValue
if !math.IsNaN(v) {
lastValue = v
continue
}
values[i] = v
prevValue = v
values[i] = lastValue
}
}
return rvs, nil
}
func transformKeepNextValue(tfa *transformFuncArg) ([]*timeseries, error) {
args := tfa.args
if err := expectTransformArgsNum(args, 1); err != nil {
return nil, err
}
rvs := args[0]
for _, ts := range rvs {
values := ts.Values
if len(values) == 0 {
continue
}
nextValue := values[len(values)-1]
for i := len(values) - 1; i >= 0; i-- {
v := values[i]
if !math.IsNaN(v) {
nextValue = v
continue
}
values[i] = nextValue
}
}
return rvs, nil
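keep_last_value fills NaN gaps forward from the last seen value, while keep_next_value fills them backward from the next seen value. A minimal, self-contained sketch over plain float slices (non-empty input is assumed, as the callers above guarantee):

package main

import (
	"fmt"
	"math"
)

func keepLastValue(values []float64) {
	last := values[0]
	for i, v := range values {
		if !math.IsNaN(v) {
			last = v
			continue
		}
		values[i] = last
	}
}

func keepNextValue(values []float64) {
	next := values[len(values)-1]
	for i := len(values) - 1; i >= 0; i-- {
		if v := values[i]; !math.IsNaN(v) {
			next = v
			continue
		}
		values[i] = next
	}
}

func main() {
	nan := math.NaN()
	a := []float64{1, nan, nan, 4}
	b := []float64{1, nan, nan, 4}
	keepLastValue(a)
	keepNextValue(b)
	fmt.Println(a) // [1 1 1 4]
	fmt.Println(b) // [1 4 4 4]
}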
@@ -499,10 +822,6 @@ func transformRangeQuantile(tfa *transformFuncArg) ([]*timeseries, error) {
hf.Reset()
lastIdx := -1
values := ts.Values
if len(values) > 0 {
// Ignore the last value. See Exec func for details.
values = values[:len(values)-1]
}
for i, v := range values {
if math.IsNaN(v) {
continue
@@ -553,14 +872,7 @@ func transformRangeLast(tfa *transformFuncArg) ([]*timeseries, error) {
func setLastValues(tss []*timeseries) {
for _, ts := range tss {
values := ts.Values
if len(values) < 2 {
continue
}
// Do not take into account the last value, since it shouldn't be included
// in the range. See Exec func for details.
values = values[:len(values)-1]
values = skipTrailingNaNs(values)
values := skipTrailingNaNs(ts.Values)
if len(values) == 0 {
continue
}
@@ -641,9 +953,7 @@ func transformUnion(tfa *transformFuncArg) ([]*timeseries, error) {
continue
}
m[string(bb.B)] = true
var dst timeseries
dst.CopyFrom(ts)
rvs = append(rvs, &dst)
rvs = append(rvs, ts)
}
}
bbPool.Put(bb)
@@ -816,6 +1126,31 @@ func transformLabelJoin(tfa *transformFuncArg) ([]*timeseries, error) {
return rvs, nil
}
func transformLabelTransform(tfa *transformFuncArg) ([]*timeseries, error) {
args := tfa.args
if err := expectTransformArgsNum(args, 4); err != nil {
return nil, err
}
label, err := getString(args[1], 1)
if err != nil {
return nil, err
}
regex, err := getString(args[2], 2)
if err != nil {
return nil, err
}
replacement, err := getString(args[3], 3)
if err != nil {
return nil, err
}
r, err := metricsql.CompileRegexp(regex)
if err != nil {
return nil, fmt.Errorf(`cannot compile regex %q: %s`, regex, err)
}
return labelReplace(args[0], label, r, label, replacement)
}
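label_transform(q, label, regexp, replacement) is label_replace with the source and destination label set to the same name and an unanchored regexp, so it can rewrite part of a label value in place. A small, self-contained sketch of the regexp substitution it relies on (the pattern and label values are hypothetical):

package main

import (
	"fmt"
	"regexp"
)

func main() {
	// label_transform(q, "instance", "-\\d+$", "") would strip a trailing "-<number>".
	r := regexp.MustCompile(`-\d+$`) // unanchored, unlike label_replace's anchored regexp
	fmt.Println(r.ReplaceAllString("web-7", ""))         // web
	fmt.Println(r.ReplaceAllString("db-eu-west-42", "")) // db-eu-west
}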
func transformLabelReplace(tfa *transformFuncArg) ([]*timeseries, error) {
args := tfa.args
if err := expectTransformArgsNum(args, 5); err != nil {
@@ -838,15 +1173,16 @@ func transformLabelReplace(tfa *transformFuncArg) ([]*timeseries, error) {
return nil, err
}
r, err := compileRegexpAnchored(regex)
r, err := metricsql.CompileRegexpAnchored(regex)
if err != nil {
return nil, fmt.Errorf(`cannot compile regex %q: %s`, regex, err)
}
return labelReplace(args[0], srcLabel, r, dstLabel, replacement)
}
func labelReplace(tss []*timeseries, srcLabel string, r *regexp.Regexp, dstLabel, replacement string) ([]*timeseries, error) {
replacementBytes := []byte(replacement)
rvs := args[0]
for _, ts := range rvs {
for _, ts := range tss {
mn := &ts.MetricName
dstValue := getDstValue(mn, dstLabel)
srcValue := mn.GetTagValue(srcLabel)
@@ -856,6 +1192,89 @@ func transformLabelReplace(tfa *transformFuncArg) ([]*timeseries, error) {
mn.RemoveTag(dstLabel)
}
}
return tss, nil
}
func transformLabelValue(tfa *transformFuncArg) ([]*timeseries, error) {
args := tfa.args
if err := expectTransformArgsNum(args, 2); err != nil {
return nil, err
}
labelName, err := getString(args[1], 1)
if err != nil {
return nil, fmt.Errorf("cannot get label name: %s", err)
}
rvs := args[0]
for _, ts := range rvs {
ts.MetricName.ResetMetricGroup()
labelValue := ts.MetricName.GetTagValue(labelName)
v, err := strconv.ParseFloat(string(labelValue), 64)
if err != nil {
v = nan
}
values := ts.Values
for i := range values {
values[i] = v
}
}
// Do not remove timeseries with only NaN values, so `default` could be applied to them:
// label_value(q, "label") default 123
return rvs, nil
}
func transformLabelMatch(tfa *transformFuncArg) ([]*timeseries, error) {
args := tfa.args
if err := expectTransformArgsNum(args, 3); err != nil {
return nil, err
}
labelName, err := getString(args[1], 1)
if err != nil {
return nil, fmt.Errorf("cannot get label name: %s", err)
}
labelRe, err := getString(args[2], 2)
if err != nil {
return nil, fmt.Errorf("cannot get regexp: %s", err)
}
r, err := metricsql.CompileRegexpAnchored(labelRe)
if err != nil {
return nil, fmt.Errorf(`cannot compile regexp %q: %s`, labelRe, err)
}
tss := args[0]
rvs := tss[:0]
for _, ts := range tss {
labelValue := ts.MetricName.GetTagValue(labelName)
if r.Match(labelValue) {
rvs = append(rvs, ts)
}
}
return rvs, nil
}
func transformLabelMismatch(tfa *transformFuncArg) ([]*timeseries, error) {
args := tfa.args
if err := expectTransformArgsNum(args, 3); err != nil {
return nil, err
}
labelName, err := getString(args[1], 1)
if err != nil {
return nil, fmt.Errorf("cannot get label name: %s", err)
}
labelRe, err := getString(args[2], 2)
if err != nil {
return nil, fmt.Errorf("cannot get regexp: %s", err)
}
r, err := metricsql.CompileRegexpAnchored(labelRe)
if err != nil {
return nil, fmt.Errorf(`cannot compile regexp %q: %s`, labelRe, err)
}
tss := args[0]
rvs := tss[:0]
for _, ts := range tss {
labelValue := ts.MetricName.GetTagValue(labelName)
if !r.Match(labelValue) {
rvs = append(rvs, ts)
}
}
return rvs, nil
}
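Both label_match and label_mismatch filter the input slice in place via rvs := tss[:0], reusing the backing array instead of allocating a new slice. A self-contained sketch of that idiom:

package main

import "fmt"

func main() {
	src := []int{1, 2, 3, 4, 5}
	dst := src[:0] // shares the backing array with src
	for _, v := range src {
		if v%2 == 0 {
			dst = append(dst, v) // safe: the write index never outruns the read index
		}
	}
	fmt.Println(dst) // [2 4]
}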
@@ -921,7 +1340,7 @@ func transformScalar(tfa *transformFuncArg) ([]*timeseries, error) {
// Verify whether the arg is a string.
// Then try converting the string to number.
if se, ok := tfa.fe.Args[0].(*stringExpr); ok {
if se, ok := tfa.fe.Args[0].(*metricsql.StringExpr); ok {
n, err := strconv.ParseFloat(se.S, 64)
if err != nil {
n = nan
@@ -938,6 +1357,29 @@ func transformScalar(tfa *transformFuncArg) ([]*timeseries, error) {
return arg, nil
}
func newTransformFuncSortByLabel(isDesc bool) transformFunc {
return func(tfa *transformFuncArg) ([]*timeseries, error) {
args := tfa.args
if err := expectTransformArgsNum(args, 2); err != nil {
return nil, err
}
label, err := getString(args[1], 1)
if err != nil {
return nil, fmt.Errorf("cannot parse label name for sorting: %s", err)
}
rvs := args[0]
sort.SliceStable(rvs, func(i, j int) bool {
a := rvs[i].MetricName.GetTagValue(label)
b := rvs[j].MetricName.GetTagValue(label)
if isDesc {
return string(b) < string(a)
}
return string(a) < string(b)
})
return rvs, nil
}
}
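sort_by_label uses sort.SliceStable, so series sharing the same label value keep their original relative order. A minimal, self-contained sketch with hypothetical series:

package main

import (
	"fmt"
	"sort"
)

type series struct {
	node string // stands in for the label value being sorted on
	name string
}

func main() {
	ss := []series{{"b", "s1"}, {"a", "s2"}, {"a", "s3"}, {"b", "s4"}}
	sort.SliceStable(ss, func(i, j int) bool { return ss[i].node < ss[j].node })
	fmt.Println(ss) // [{a s2} {a s3} {b s1} {b s4}] - ties keep input order
}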
func newTransformFuncSort(isDesc bool) transformFunc {
return func(tfa *transformFuncArg) ([]*timeseries, error) {
args := tfa.args
@@ -950,7 +1392,7 @@ func newTransformFuncSort(isDesc bool) transformFunc {
b := rvs[j].Values
n := len(a) - 1
for n >= 0 {
if !math.IsNaN(a[n]) && !math.IsNaN(b[n]) {
if !math.IsNaN(a[n]) && !math.IsNaN(b[n]) && a[n] != b[n] {
break
}
n--
@@ -958,11 +1400,10 @@ func newTransformFuncSort(isDesc bool) transformFunc {
if n < 0 {
return false
}
cmp := a[n] < b[n]
if isDesc {
cmp = !cmp
return b[n] < a[n]
}
return cmp
return a[n] < b[n]
})
return rvs, nil
}
@@ -1052,7 +1493,10 @@ func transformTimestamp(tfa *transformFuncArg) ([]*timeseries, error) {
ts.MetricName.ResetMetricGroup()
values := ts.Values
for i, t := range ts.Timestamps {
values[i] = float64(t) / 1e3
v := values[i]
if !math.IsNaN(v) {
values[i] = float64(t) / 1e3
}
}
}
return rvs, nil
@@ -1090,9 +1534,7 @@ func transformStart(tfa *transformFuncArg) float64 {
}
func transformEnd(tfa *transformFuncArg) float64 {
// Subtract step from end, since it shouldn't go to the range.
// See Exec func for details.
return float64(tfa.ec.End-tfa.ec.Step) * 1e-3
return float64(tfa.ec.End) * 1e-3
}
// copyTimeseriesMetricNames returns a copy of arg with real copy of MetricNames,

app/vmstorage/Makefile Normal file
View File

@@ -0,0 +1,38 @@
# All these commands must run from repository root.
run-vmstorage:
mkdir -p vmstorage-data
DOCKER_OPTS='-v $(shell pwd)/vmstorage-data:/vmstorage-data' \
APP_NAME=vmstorage \
ARGS='-retentionPeriod=12' \
$(MAKE) run-via-docker
vmstorage:
APP_NAME=vmstorage $(MAKE) app-local
vmstorage-race:
APP_NAME=vmstorage RACE=-race $(MAKE) app-local
vmstorage-prod:
APP_NAME=vmstorage $(MAKE) app-via-docker
vmstorage-pure-prod:
APP_NAME=vmstorage $(MAKE) app-via-docker-pure
vmstorage-prod-race:
APP_NAME=vmstorage RACE=-race $(MAKE) app-via-docker
vmstorage-pure:
APP_NAME=vmstorage $(MAKE) app-local-pure
package-vmstorage:
APP_NAME=vmstorage $(MAKE) package-via-docker
package-vmstorage-race:
APP_NAME=vmstorage RACE=-race $(MAKE) package-via-docker
publish-vmstorage:
APP_NAME=vmstorage $(MAKE) publish-via-docker
publish-vmstorage-race:
APP_NAME=vmstorage RACE=-race $(MAKE) publish-via-docker

View File

@@ -1,5 +1,5 @@
`vmstorage` performs the following tasks:
- Accepts inserts from `vminsert` and stores them to local storage.
- Accepts inserts from `vminsert` nodes and stores them to local storage.
- Performs select requests from `vmselect`.
- Performs select requests from `vmselect` nodes.

View File

@@ -0,0 +1,10 @@
ARG certs_image
FROM $certs_image AS certs
FROM scratch
COPY --from=certs /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-certificates.crt
ARG src_binary
COPY $src_binary ./vmstorage-prod
EXPOSE 8482
EXPOSE 8400
EXPOSE 8401
ENTRYPOINT ["/vmstorage-prod"]

View File

@@ -1,4 +1,4 @@
package vmstorage
package main
import (
"flag"
@@ -8,122 +8,99 @@ import (
"sync"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmstorage/transport"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/envflag"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/storage"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/syncwg"
"github.com/VictoriaMetrics/metrics"
)
var (
httpListenAddr = flag.String("httpListenAddr", ":8482", "Address to listen for http connections")
retentionPeriod = flag.Int("retentionPeriod", 1, "Retention period in months")
storageDataPath = flag.String("storageDataPath", "vmstorage-data", "Path to storage data")
vminsertAddr = flag.String("vminsertAddr", ":8400", "TCP address to accept connections from vminsert services")
vmselectAddr = flag.String("vmselectAddr", ":8401", "TCP address to accept connections from vmselect services")
snapshotAuthKey = flag.String("snapshotAuthKey", "", "authKey, which must be passed in query string to /snapshot* pages")
precisionBits = flag.Int("precisionBits", 64, "The number of precision bits to store per each value. Lower precision bits improves data compression at the cost of precision loss")
// DataPath is a path to storage data.
DataPath = flag.String("storageDataPath", "victoria-metrics-data", "Path to storage data")
bigMergeConcurrency = flag.Int("bigMergeConcurrency", 0, "The maximum number of CPU cores to use for big merges. Default value is used if set to 0")
smallMergeConcurrency = flag.Int("smallMergeConcurrency", 0, "The maximum number of CPU cores to use for small merges. Default value is used if set to 0")
minScrapeInterval = flag.Duration("dedup.minScrapeInterval", 0, "Remove superfluous samples from time series if they are located closer to each other than this duration. "+
"This may be useful for reducing overhead when multiple identically configured Prometheus instances write data to the same VictoriaMetrics. "+
"Deduplication is disabled if the -dedup.minScrapeInterval is 0")
)
// Init initializes vmstorage.
func Init() {
if err := encoding.CheckPrecisionBits(uint8(*precisionBits)); err != nil {
logger.Fatalf("invalid `-precisionBits`: %s", err)
}
logger.Infof("opening storage at %q with retention period %d months", *DataPath, *retentionPeriod)
func main() {
envflag.Parse()
buildinfo.Init()
logger.Init()
storage.SetMinScrapeIntervalForDeduplication(*minScrapeInterval)
storage.SetBigMergeWorkersCount(*bigMergeConcurrency)
storage.SetSmallMergeWorkersCount(*smallMergeConcurrency)
logger.Infof("opening storage at %q with retention period %d months", *storageDataPath, *retentionPeriod)
startTime := time.Now()
strg, err := storage.OpenStorage(*DataPath, *retentionPeriod)
strg, err := storage.OpenStorage(*storageDataPath, *retentionPeriod)
if err != nil {
logger.Fatalf("cannot open a storage at %s with retention period %d months: %s", *DataPath, *retentionPeriod, err)
logger.Fatalf("cannot open a storage at %s with retention period %d months: %s", *storageDataPath, *retentionPeriod, err)
}
Storage = strg
var m storage.Metrics
Storage.UpdateMetrics(&m)
strg.UpdateMetrics(&m)
tm := &m.TableMetrics
partsCount := tm.SmallPartsCount + tm.BigPartsCount
blocksCount := tm.SmallBlocksCount + tm.BigBlocksCount
rowsCount := tm.SmallRowsCount + tm.BigRowsCount
logger.Infof("successfully opened storage %q in %s; partsCount: %d; blocksCount: %d; rowsCount: %d",
*DataPath, time.Since(startTime), partsCount, blocksCount, rowsCount)
sizeBytes := tm.SmallSizeBytes + tm.BigSizeBytes
logger.Infof("successfully opened storage %q in %.3f seconds; partsCount: %d; blocksCount: %d; rowsCount: %d; sizeBytes: %d",
*storageDataPath, time.Since(startTime).Seconds(), partsCount, blocksCount, rowsCount, sizeBytes)
registerStorageMetrics(Storage)
}
registerStorageMetrics(strg)
// Storage is a storage.
//
// Every storage call must be wrapped into WG.Add(1) ... WG.Done()
// for proper graceful shutdown when Stop is called.
var Storage *storage.Storage
// WG must be incremented before Storage call.
//
// Use syncwg instead of sync, since Add is called from concurrent goroutines.
var WG syncwg.WaitGroup
// AddRows adds mrs to the storage.
func AddRows(mrs []storage.MetricRow) error {
WG.Add(1)
err := Storage.AddRows(mrs, uint8(*precisionBits))
WG.Done()
return err
}
// DeleteMetrics deletes metrics matching tfss.
//
// Returns the number of deleted metrics.
func DeleteMetrics(tfss []*storage.TagFilters) (int, error) {
WG.Add(1)
n, err := Storage.DeleteMetrics(tfss)
WG.Done()
return n, err
}
// SearchTagKeys searches for tag keys
func SearchTagKeys(maxTagKeys int) ([]string, error) {
WG.Add(1)
keys, err := Storage.SearchTagKeys(maxTagKeys)
WG.Done()
return keys, err
}
// SearchTagValues searches for tag values for the given tagKey
func SearchTagValues(tagKey []byte, maxTagValues int) ([]string, error) {
WG.Add(1)
values, err := Storage.SearchTagValues(tagKey, maxTagValues)
WG.Done()
return values, err
}
// GetSeriesCount returns the number of time series in the storage.
func GetSeriesCount() (uint64, error) {
WG.Add(1)
n, err := Storage.GetSeriesCount()
WG.Done()
return n, err
}
// Stop stops the vmstorage
func Stop() {
logger.Infof("gracefully closing the storage at %s", *DataPath)
startTime := time.Now()
WG.WaitAndBlock()
Storage.MustClose()
logger.Infof("successfully closed the storage in %s", time.Since(startTime))
logger.Infof("the storage has been stopped")
}
// RequestHandler is a storage request handler.
func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
path := r.URL.Path
prometheusCompatibleResponse := false
if path == "/api/v1/admin/tsdb/snapshot" {
// Handle Prometheus API - https://prometheus.io/docs/prometheus/latest/querying/api/#snapshot .
prometheusCompatibleResponse = true
path = "/snapshot/create"
srv, err := transport.NewServer(*vminsertAddr, *vmselectAddr, strg)
if err != nil {
logger.Fatalf("cannot create a server with vminsertAddr=%s, vmselectAddr=%s: %s", *vminsertAddr, *vmselectAddr, err)
}
go srv.RunVMInsert()
go srv.RunVMSelect()
requestHandler := newRequestHandler(strg)
go func() {
httpserver.Serve(*httpListenAddr, requestHandler)
}()
sig := procutil.WaitForSigterm()
logger.Infof("service received signal %s", sig)
logger.Infof("gracefully shutting down the service")
startTime = time.Now()
srv.MustClose()
logger.Infof("successfully shut down the service in %.3f seconds", time.Since(startTime).Seconds())
logger.Infof("gracefully closing the storage at %s", *storageDataPath)
startTime = time.Now()
strg.MustClose()
logger.Infof("successfully closed the storage in %.3f seconds", time.Since(startTime).Seconds())
fs.MustStopDirRemover()
logger.Infof("the vmstorage has been stopped")
}
func newRequestHandler(strg *storage.Storage) httpserver.RequestHandler {
return func(w http.ResponseWriter, r *http.Request) bool {
return requestHandler(w, r, strg)
}
}
func requestHandler(w http.ResponseWriter, r *http.Request, strg *storage.Storage) bool {
path := r.URL.Path
if !strings.HasPrefix(path, "/snapshot") {
return false
}
@@ -137,22 +114,18 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
switch path {
case "/create":
w.Header().Set("Content-Type", "application/json")
snapshotPath, err := Storage.CreateSnapshot()
snapshotPath, err := strg.CreateSnapshot()
if err != nil {
msg := fmt.Sprintf("cannot create snapshot: %s", err)
logger.Errorf("%s", msg)
fmt.Fprintf(w, `{"status":"error","msg":%q}`, msg)
return true
}
if prometheusCompatibleResponse {
fmt.Fprintf(w, `{"status":"success","data":{"name":%q}}`, snapshotPath)
} else {
fmt.Fprintf(w, `{"status":"ok","snapshot":%q}`, snapshotPath)
}
fmt.Fprintf(w, `{"status":"ok","snapshot":%q}`, snapshotPath)
return true
case "/list":
w.Header().Set("Content-Type", "application/json")
snapshots, err := Storage.ListSnapshots()
snapshots, err := strg.ListSnapshots()
if err != nil {
msg := fmt.Sprintf("cannot list snapshots: %s", err)
logger.Errorf("%s", msg)
@@ -171,7 +144,7 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
case "/delete":
w.Header().Set("Content-Type", "application/json")
snapshotName := r.FormValue("snapshot")
if err := Storage.DeleteSnapshot(snapshotName); err != nil {
if err := strg.DeleteSnapshot(snapshotName); err != nil {
msg := fmt.Sprintf("cannot delete snapshot %q: %s", snapshotName, err)
logger.Errorf("%s", msg)
fmt.Fprintf(w, `{"status":"error","msg":%q}`, msg)
@@ -181,7 +154,7 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
return true
case "/delete_all":
w.Header().Set("Content-Type", "application/json")
snapshots, err := Storage.ListSnapshots()
snapshots, err := strg.ListSnapshots()
if err != nil {
msg := fmt.Sprintf("cannot list snapshots: %s", err)
logger.Errorf("%s", msg)
@@ -189,7 +162,7 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
return true
}
for _, snapshotName := range snapshots {
if err := Storage.DeleteSnapshot(snapshotName); err != nil {
if err := strg.DeleteSnapshot(snapshotName); err != nil {
msg := fmt.Sprintf("cannot delete snapshot %q: %s", snapshotName, err)
logger.Errorf("%s", msg)
fmt.Fprintf(w, `{"status":"error","msg":%q}`, msg)
@@ -282,9 +255,30 @@ func registerStorageMetrics(strg *storage.Storage) {
return float64(idbm().PartsRefCount)
})
metrics.NewGauge(`vm_new_timeseries_created_total`, func() float64 {
return float64(idbm().NewTimeseriesCreated)
})
metrics.NewGauge(`vm_missing_tsids_for_metric_id_total`, func() float64 {
return float64(idbm().MissingTSIDsForMetricID)
})
metrics.NewGauge(`vm_recent_hour_metric_ids_search_calls_total`, func() float64 {
return float64(idbm().RecentHourMetricIDsSearchCalls)
})
metrics.NewGauge(`vm_recent_hour_metric_ids_search_hits_total`, func() float64 {
return float64(idbm().RecentHourMetricIDsSearchHits)
})
metrics.NewGauge(`vm_date_metric_ids_search_calls_total`, func() float64 {
return float64(idbm().DateMetricIDsSearchCalls)
})
metrics.NewGauge(`vm_date_metric_ids_search_hits_total`, func() float64 {
return float64(idbm().DateMetricIDsSearchHits)
})
metrics.NewGauge(`vm_index_blocks_with_metric_ids_processed_total`, func() float64 {
return float64(idbm().IndexBlocksWithMetricIDsProcessed)
})
metrics.NewGauge(`vm_index_blocks_with_metric_ids_incorrect_order_total`, func() float64 {
return float64(idbm().IndexBlocksWithMetricIDsIncorrectOrder)
})
metrics.NewGauge(`vm_assisted_merges_total{type="storage/small"}`, func() float64 {
return float64(tm().SmallAssistedMerges)
@@ -320,6 +314,39 @@ func registerStorageMetrics(strg *storage.Storage) {
return float64(idbm().BlocksCount)
})
metrics.NewGauge(`vm_data_size_bytes{type="storage/big"}`, func() float64 {
return float64(tm().BigSizeBytes)
})
metrics.NewGauge(`vm_data_size_bytes{type="storage/small"}`, func() float64 {
return float64(tm().SmallSizeBytes)
})
metrics.NewGauge(`vm_data_size_bytes{type="indexdb"}`, func() float64 {
return float64(idbm().SizeBytes)
})
metrics.NewGauge(`vm_rows_ignored_total{reason="big_timestamp"}`, func() float64 {
return float64(m().TooBigTimestampRows)
})
metrics.NewGauge(`vm_rows_ignored_total{reason="small_timestamp"}`, func() float64 {
return float64(m().TooSmallTimestampRows)
})
metrics.NewGauge(`vm_concurrent_addrows_limit_reached_total`, func() float64 {
return float64(m().AddRowsConcurrencyLimitReached)
})
metrics.NewGauge(`vm_concurrent_addrows_limit_timeout_total`, func() float64 {
return float64(m().AddRowsConcurrencyLimitTimeout)
})
metrics.NewGauge(`vm_concurrent_addrows_dropped_rows_total`, func() float64 {
return float64(m().AddRowsConcurrencyDroppedRows)
})
metrics.NewGauge(`vm_concurrent_addrows_capacity`, func() float64 {
return float64(m().AddRowsConcurrencyCapacity)
})
metrics.NewGauge(`vm_concurrent_addrows_current`, func() float64 {
return float64(m().AddRowsConcurrencyCurrent)
})
metrics.NewGauge(`vm_rows{type="storage/big"}`, func() float64 {
return float64(tm().BigRowsCount)
})
@@ -330,6 +357,24 @@ func registerStorageMetrics(strg *storage.Storage) {
return float64(idbm().ItemsCount)
})
metrics.NewGauge(`vm_date_range_search_calls_total`, func() float64 {
return float64(idbm().DateRangeSearchCalls)
})
metrics.NewGauge(`vm_date_range_hits_total`, func() float64 {
return float64(idbm().DateRangeSearchHits)
})
metrics.NewGauge(`vm_missing_metric_names_for_metric_id_total`, func() float64 {
return float64(idbm().MissingMetricNamesForMetricID)
})
metrics.NewGauge(`vm_date_metric_id_cache_syncs_total`, func() float64 {
return float64(m().DateMetricIDCacheSyncsCount)
})
metrics.NewGauge(`vm_date_metric_id_cache_resets_total`, func() float64 {
return float64(m().DateMetricIDCacheResetsCount)
})
metrics.NewGauge(`vm_cache_entries{type="storage/tsid"}`, func() float64 {
return float64(m().TSIDCacheSize)
})
@@ -342,6 +387,9 @@ func registerStorageMetrics(strg *storage.Storage) {
metrics.NewGauge(`vm_cache_entries{type="storage/date_metricID"}`, func() float64 {
return float64(m().DateMetricIDCacheSize)
})
metrics.NewGauge(`vm_cache_entries{type="storage/hour_metric_ids"}`, func() float64 {
return float64(m().HourMetricIDCacheSize)
})
metrics.NewGauge(`vm_cache_entries{type="storage/bigIndexBlocks"}`, func() float64 {
return float64(tm().BigIndexBlocksCacheSize)
})
@@ -357,24 +405,39 @@ func registerStorageMetrics(strg *storage.Storage) {
metrics.NewGauge(`vm_cache_entries{type="indexdb/tagFilters"}`, func() float64 {
return float64(idbm().TagCacheSize)
})
metrics.NewGauge(`vm_cache_entries{type="indexdb/uselessTagFilters"}`, func() float64 {
return float64(idbm().UselessTagFiltersCacheSize)
})
metrics.NewGauge(`vm_cache_entries{type="storage/regexps"}`, func() float64 {
return float64(storage.RegexpCacheSize())
})
metrics.NewGauge(`vm_cache_size_entries{type="storage/prefetchedMetricIDs"}`, func() float64 {
return float64(m().PrefetchedMetricIDsSize)
})
metrics.NewGauge(`vm_cache_size_bytes{type="storage/tsid"}`, func() float64 {
return float64(m().TSIDCacheBytesSize)
return float64(m().TSIDCacheSizeBytes)
})
metrics.NewGauge(`vm_cache_size_bytes{type="storage/metricIDs"}`, func() float64 {
return float64(m().MetricIDCacheBytesSize)
return float64(m().MetricIDCacheSizeBytes)
})
metrics.NewGauge(`vm_cache_size_bytes{type="storage/metricName"}`, func() float64 {
return float64(m().MetricNameCacheBytesSize)
return float64(m().MetricNameCacheSizeBytes)
})
metrics.NewGauge(`vm_cache_size_bytes{type="storage/date_metricID"}`, func() float64 {
return float64(m().DateMetricIDCacheBytesSize)
return float64(m().DateMetricIDCacheSizeBytes)
})
metrics.NewGauge(`vm_cache_size_bytes{type="storage/hour_metric_ids"}`, func() float64 {
return float64(m().HourMetricIDCacheSizeBytes)
})
metrics.NewGauge(`vm_cache_size_bytes{type="indexdb/tagFilters"}`, func() float64 {
return float64(idbm().TagCacheBytesSize)
return float64(idbm().TagCacheSizeBytes)
})
metrics.NewGauge(`vm_cache_size_bytes{type="indexdb/uselessTagFilters"}`, func() float64 {
return float64(idbm().UselessTagFiltersCacheSizeBytes)
})
metrics.NewGauge(`vm_cache_size_bytes{type="storage/prefetchedMetricIDs"}`, func() float64 {
return float64(m().PrefetchedMetricIDsSizeBytes)
})
metrics.NewGauge(`vm_cache_requests_total{type="storage/tsid"}`, func() float64 {
@@ -386,9 +449,6 @@ func registerStorageMetrics(strg *storage.Storage) {
metrics.NewGauge(`vm_cache_requests_total{type="storage/metricName"}`, func() float64 {
return float64(m().MetricNameCacheRequests)
})
metrics.NewGauge(`vm_cache_requests_total{type="storage/date_metricID"}`, func() float64 {
return float64(m().DateMetricIDCacheRequests)
})
metrics.NewGauge(`vm_cache_requests_total{type="storage/bigIndexBlocks"}`, func() float64 {
return float64(tm().BigIndexBlocksCacheRequests)
})
@@ -404,6 +464,9 @@ func registerStorageMetrics(strg *storage.Storage) {
metrics.NewGauge(`vm_cache_requests_total{type="indexdb/tagFilters"}`, func() float64 {
return float64(idbm().TagCacheRequests)
})
metrics.NewGauge(`vm_cache_requests_total{type="indexdb/uselessTagFilters"}`, func() float64 {
return float64(idbm().UselessTagFiltersCacheRequests)
})
metrics.NewGauge(`vm_cache_requests_total{type="storage/regexps"}`, func() float64 {
return float64(storage.RegexpCacheRequests())
})
@@ -417,9 +480,6 @@ func registerStorageMetrics(strg *storage.Storage) {
metrics.NewGauge(`vm_cache_misses_total{type="storage/metricName"}`, func() float64 {
return float64(m().MetricNameCacheMisses)
})
metrics.NewGauge(`vm_cache_misses_total{type="storage/date_metricID"}`, func() float64 {
return float64(m().DateMetricIDCacheMisses)
})
metrics.NewGauge(`vm_cache_misses_total{type="storage/bigIndexBlocks"}`, func() float64 {
return float64(tm().BigIndexBlocksCacheMisses)
})
@@ -435,6 +495,9 @@ func registerStorageMetrics(strg *storage.Storage) {
metrics.NewGauge(`vm_cache_misses_total{type="indexdb/tagFilters"}`, func() float64 {
return float64(idbm().TagCacheMisses)
})
metrics.NewGauge(`vm_cache_misses_total{type="indexdb/uselessTagFilters"}`, func() float64 {
return float64(idbm().UselessTagFiltersCacheMisses)
})
metrics.NewGauge(`vm_cache_misses_total{type="storage/regexps"}`, func() float64 {
return float64(storage.RegexpCacheMisses())
})
@@ -449,7 +512,4 @@ func registerStorageMetrics(strg *storage.Storage) {
metrics.NewGauge(`vm_cache_collisions_total{type="storage/metricName"}`, func() float64 {
return float64(m().MetricNameCacheCollisions)
})
metrics.NewGauge(`vm_cache_collisions_total{type="storage/date_metricID"}`, func() float64 {
return float64(m().DateMetricIDCacheCollisions)
})
}
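
registerStorageMetrics wires storage counters into github.com/VictoriaMetrics/metrics by registering callback-based gauges, so every scrape reports a freshly computed value. A minimal, self-contained sketch of the same pattern (the metric name and handler below are hypothetical, not part of vmstorage):

package main

import (
	"net/http"
	"sync/atomic"

	"github.com/VictoriaMetrics/metrics"
)

func main() {
	var requestsServed uint64

	// The callback runs on every scrape, so the gauge always reflects the current value.
	metrics.NewGauge(`demo_requests_served_total`, func() float64 {
		return float64(atomic.LoadUint64(&requestsServed))
	})

	http.HandleFunc("/metrics", func(w http.ResponseWriter, r *http.Request) {
		atomic.AddUint64(&requestsServed, 1)
		metrics.WritePrometheus(w, true) // also exposes process metrics
	})
	_ = http.ListenAndServe(":8080", nil)
}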

View File

@@ -0,0 +1,788 @@
package transport
import (
"flag"
"fmt"
"io"
"net"
"sync"
"sync/atomic"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/consts"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/handshake"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/netutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/storage"
"github.com/VictoriaMetrics/metrics"
)
var (
maxTagKeysPerSearch = flag.Int("search.maxTagKeys", 100e3, "The maximum number of tag keys returned per search")
maxTagValuesPerSearch = flag.Int("search.maxTagValues", 100e3, "The maximum number of tag values returned per search")
maxMetricsPerSearch = flag.Int("search.maxUniqueTimeseries", 300e3, "The maximum number of unique time series each search can scan")
precisionBits = flag.Int("precisionBits", 64, "The number of precision bits to store per each value. Lower precision bits improves data compression at the cost of precision loss")
disableRPCCompression = flag.Bool(`rpc.disableCompression`, false, "Disable compression of RPC traffic. This reduces CPU usage at the cost of higher network bandwidth usage")
)
// Server processes connections from vminsert and vmselect.
type Server struct {
storage *storage.Storage
vminsertLN net.Listener
vmselectLN net.Listener
vminsertWG sync.WaitGroup
vmselectWG sync.WaitGroup
vminsertConnsMap connsMap
vmselectConnsMap connsMap
stopFlag uint64
}
type connsMap struct {
mu sync.Mutex
m map[net.Conn]struct{}
}
func (cm *connsMap) Init() {
cm.m = make(map[net.Conn]struct{})
}
func (cm *connsMap) Add(c net.Conn) {
cm.mu.Lock()
cm.m[c] = struct{}{}
cm.mu.Unlock()
}
func (cm *connsMap) Delete(c net.Conn) {
cm.mu.Lock()
delete(cm.m, c)
cm.mu.Unlock()
}
func (cm *connsMap) CloseAll() {
cm.mu.Lock()
for c := range cm.m {
_ = c.Close()
}
cm.mu.Unlock()
}
// NewServer returns new Server.
func NewServer(vminsertAddr, vmselectAddr string, storage *storage.Storage) (*Server, error) {
vminsertLN, err := netutil.NewTCPListener("vminsert", vminsertAddr)
if err != nil {
return nil, fmt.Errorf("unable to listen vminsertAddr %s: %s", vminsertAddr, err)
}
vmselectLN, err := netutil.NewTCPListener("vmselect", vmselectAddr)
if err != nil {
return nil, fmt.Errorf("unable to listen vmselectAddr %s: %s", vmselectAddr, err)
}
if err := encoding.CheckPrecisionBits(uint8(*precisionBits)); err != nil {
return nil, fmt.Errorf("invalid -precisionBits: %s", err)
}
// Set network-level write timeouts to reasonable values in order to protect
// from broken networks.
// Do not set read timeouts, since they are managed separately -
// search for SetReadDeadline in this file.
vminsertLN.WriteTimeout = time.Minute
vmselectLN.WriteTimeout = time.Minute
s := &Server{
storage: storage,
vminsertLN: vminsertLN,
vmselectLN: vmselectLN,
}
s.vminsertConnsMap.Init()
s.vmselectConnsMap.Init()
return s, nil
}
// RunVMInsert runs a server accepting connections from vminsert.
func (s *Server) RunVMInsert() {
logger.Infof("accepting vminsert conns at %s", s.vminsertLN.Addr())
for {
c, err := s.vminsertLN.Accept()
if err != nil {
if pe, ok := err.(net.Error); ok && pe.Temporary() {
continue
}
if s.isStopping() {
return
}
logger.Panicf("FATAL: cannot process vminsert conns at %s: %s", s.vminsertLN.Addr(), err)
}
logger.Infof("accepted vminsert conn from %s", c.RemoteAddr())
vminsertConns.Inc()
s.vminsertConnsMap.Add(c)
s.vminsertWG.Add(1)
go func() {
defer func() {
s.vminsertConnsMap.Delete(c)
vminsertConns.Dec()
s.vminsertWG.Done()
}()
// There is no need for response compression, since
// vmstorage doesn't send anything back to vminsert.
compressionLevel := 0
bc, err := handshake.VMInsertServer(c, compressionLevel)
if err != nil {
if s.isStopping() {
// c is closed inside Server.MustClose
return
}
logger.Errorf("cannot perform vminsert handshake with client %q: %s", c.RemoteAddr(), err)
_ = c.Close()
return
}
defer func() {
if !s.isStopping() {
logger.Infof("closing vminsert conn from %s", c.RemoteAddr())
}
_ = bc.Close()
}()
logger.Infof("processing vminsert conn from %s", c.RemoteAddr())
if err := s.processVMInsertConn(bc); err != nil {
if s.isStopping() {
return
}
vminsertConnErrors.Inc()
logger.Errorf("cannot process vminsert conn from %s: %s", c.RemoteAddr(), err)
}
}()
}
}
var (
vminsertConns = metrics.NewCounter("vm_vminsert_conns")
vminsertConnErrors = metrics.NewCounter("vm_vminsert_conn_errors_total")
)
// RunVMSelect runs a server accepting connections from vmselect.
func (s *Server) RunVMSelect() {
logger.Infof("accepting vmselect conns at %s", s.vmselectLN.Addr())
for {
c, err := s.vmselectLN.Accept()
if err != nil {
if pe, ok := err.(net.Error); ok && pe.Temporary() {
continue
}
if s.isStopping() {
return
}
logger.Panicf("FATAL: cannot process vmselect conns at %s: %s", s.vmselectLN.Addr(), err)
}
logger.Infof("accepted vmselect conn from %s", c.RemoteAddr())
vmselectConns.Inc()
s.vmselectConnsMap.Add(c)
s.vmselectWG.Add(1)
go func() {
defer func() {
s.vmselectConnsMap.Delete(c)
vmselectConns.Dec()
s.vmselectWG.Done()
}()
// Compress responses to vmselect even if they already contain compressed blocks.
// Responses contain uncompressed metric names, which should compress well
// when the response contains a high number of time series.
// Additionally, recently added metric blocks are usually uncompressed, so the compression
// should save network bandwidth.
compressionLevel := 1
if *disableRPCCompression {
compressionLevel = 0
}
bc, err := handshake.VMSelectServer(c, compressionLevel)
if err != nil {
if s.isStopping() {
// c is closed inside Server.MustClose
return
}
logger.Errorf("cannot perform vmselect handshake with client %q: %s", c.RemoteAddr(), err)
_ = c.Close()
return
}
defer func() {
if !s.isStopping() {
logger.Infof("closing vmselect conn from %s", c.RemoteAddr())
}
_ = bc.Close()
}()
logger.Infof("processing vmselect conn from %s", c.RemoteAddr())
if err := s.processVMSelectConn(bc); err != nil {
if s.isStopping() {
return
}
vmselectConnErrors.Inc()
logger.Errorf("cannot process vmselect conn %s: %s", c.RemoteAddr(), err)
}
}()
}
}
var (
vmselectConns = metrics.NewCounter("vm_vmselect_conns")
vmselectConnErrors = metrics.NewCounter("vm_vmselect_conn_errors_total")
)
// MustClose gracefully closes the server,
// so it no longer touches s.storage after returning.
func (s *Server) MustClose() {
// Mark the server as stopping.
s.setIsStopping()
// Stop accepting new connections from vminsert and vmselect.
if err := s.vminsertLN.Close(); err != nil {
logger.Panicf("FATAL: cannot close vminsert listener: %s", err)
}
if err := s.vmselectLN.Close(); err != nil {
logger.Panicf("FATAL: cannot close vmselect listener: %s", err)
}
// Close existing connections from vminsert, so the goroutines
// processing these connections are finished.
s.vminsertConnsMap.CloseAll()
// Close existing connections from vmselect, so the goroutines
// processing these connections are finished.
s.vmselectConnsMap.CloseAll()
// Wait until all the goroutines processing vminsert and vmselect conns
// are finished.
s.vminsertWG.Wait()
s.vmselectWG.Wait()
}
func (s *Server) setIsStopping() {
atomic.StoreUint64(&s.stopFlag, 1)
}
func (s *Server) isStopping() bool {
return atomic.LoadUint64(&s.stopFlag) != 0
}
func (s *Server) processVMInsertConn(r io.Reader) error {
sizeBuf := make([]byte, 8)
var buf []byte
var mrs []storage.MetricRow
for {
if _, err := io.ReadFull(r, sizeBuf); err != nil {
if err == io.EOF {
// Remote end gracefully closed the connection.
return nil
}
return fmt.Errorf("cannot read packet size: %s", err)
}
packetSize := encoding.UnmarshalUint64(sizeBuf)
if packetSize > consts.MaxInsertPacketSize {
return fmt.Errorf("too big packet size: %d; shouldn't exceed %d", packetSize, consts.MaxInsertPacketSize)
}
buf = bytesutil.Resize(buf, int(packetSize))
if n, err := io.ReadFull(r, buf); err != nil {
return fmt.Errorf("cannot read packet with size %d: %s; read only %d bytes", packetSize, err, n)
}
vminsertPacketsRead.Inc()
// Read metric rows from the packet.
mrs = mrs[:0]
tail := buf
for len(tail) > 0 {
if len(mrs) < cap(mrs) {
mrs = mrs[:len(mrs)+1]
} else {
mrs = append(mrs, storage.MetricRow{})
}
mr := &mrs[len(mrs)-1]
var err error
tail, err = mr.Unmarshal(tail)
if err != nil {
return fmt.Errorf("cannot unmarshal MetricRow: %s", err)
}
}
vminsertMetricsRead.Add(len(mrs))
if err := s.storage.AddRows(mrs, uint8(*precisionBits)); err != nil {
return fmt.Errorf("cannot store metrics: %s", err)
}
}
}
var (
vminsertPacketsRead = metrics.NewCounter("vm_vminsert_packets_read_total")
vminsertMetricsRead = metrics.NewCounter("vm_vminsert_metrics_read_total")
)
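processVMInsertConn reads a size-prefixed packet format: an 8-byte length followed by that many bytes of marshaled rows, with a hard cap that rejects oversized packets before allocating. A self-contained sketch of the same framing over an in-memory buffer (the byte order and size limit here are assumptions for illustration; the real format is defined by lib/encoding and consts.MaxInsertPacketSize):

package main

import (
	"bytes"
	"encoding/binary"
	"fmt"
	"io"
)

const maxPacketSize = 1 << 20 // illustrative cap only

func writePacket(w io.Writer, payload []byte) error {
	var sizeBuf [8]byte
	binary.BigEndian.PutUint64(sizeBuf[:], uint64(len(payload))) // byte order assumed for this sketch
	if _, err := w.Write(sizeBuf[:]); err != nil {
		return err
	}
	_, err := w.Write(payload)
	return err
}

func readPacket(r io.Reader) ([]byte, error) {
	var sizeBuf [8]byte
	if _, err := io.ReadFull(r, sizeBuf[:]); err != nil {
		return nil, err
	}
	size := binary.BigEndian.Uint64(sizeBuf[:])
	if size > maxPacketSize {
		return nil, fmt.Errorf("too big packet size: %d", size)
	}
	buf := make([]byte, size)
	_, err := io.ReadFull(r, buf)
	return buf, err
}

func main() {
	var conn bytes.Buffer
	_ = writePacket(&conn, []byte("marshaled rows go here"))
	payload, err := readPacket(&conn)
	fmt.Println(string(payload), err) // marshaled rows go here <nil>
}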
func (s *Server) processVMSelectConn(bc *handshake.BufferedConn) error {
ctx := &vmselectRequestCtx{
bc: bc,
sizeBuf: make([]byte, 8),
}
for {
if err := s.processVMSelectRequest(ctx); err != nil {
if err == io.EOF {
// Remote client gracefully closed the connection.
return nil
}
return fmt.Errorf("cannot process vmselect request: %s", err)
}
if err := bc.Flush(); err != nil {
return fmt.Errorf("cannot flush compressed buffers: %s", err)
}
}
}
type vmselectRequestCtx struct {
bc *handshake.BufferedConn
sizeBuf []byte
dataBuf []byte
sq storage.SearchQuery
tfss []*storage.TagFilters
sr storage.Search
}
func (ctx *vmselectRequestCtx) readUint32() (uint32, error) {
ctx.sizeBuf = bytesutil.Resize(ctx.sizeBuf, 4)
if _, err := io.ReadFull(ctx.bc, ctx.sizeBuf); err != nil {
if err == io.EOF {
return 0, err
}
return 0, fmt.Errorf("cannot read uint32: %s", err)
}
n := encoding.UnmarshalUint32(ctx.sizeBuf)
return n, nil
}
func (ctx *vmselectRequestCtx) readDataBufBytes(maxDataSize int) error {
ctx.sizeBuf = bytesutil.Resize(ctx.sizeBuf, 8)
if _, err := io.ReadFull(ctx.bc, ctx.sizeBuf); err != nil {
if err == io.EOF {
return err
}
return fmt.Errorf("cannot read data size: %s", err)
}
dataSize := encoding.UnmarshalUint64(ctx.sizeBuf)
if dataSize > uint64(maxDataSize) {
return fmt.Errorf("too big data size: %d; it mustn't exceed %d bytes", dataSize, maxDataSize)
}
ctx.dataBuf = bytesutil.Resize(ctx.dataBuf, int(dataSize))
if dataSize == 0 {
return nil
}
if n, err := io.ReadFull(ctx.bc, ctx.dataBuf); err != nil {
return fmt.Errorf("cannot read data with size %d: %s; read only %d bytes", dataSize, err, n)
}
return nil
}
func (ctx *vmselectRequestCtx) readBool() (bool, error) {
ctx.dataBuf = bytesutil.Resize(ctx.dataBuf, 1)
if _, err := io.ReadFull(ctx.bc, ctx.dataBuf); err != nil {
if err == io.EOF {
return false, err
}
return false, fmt.Errorf("cannot read bool: %s", err)
}
v := ctx.dataBuf[0] != 0
return v, nil
}
func (ctx *vmselectRequestCtx) writeDataBufBytes() error {
if err := ctx.writeUint64(uint64(len(ctx.dataBuf))); err != nil {
return fmt.Errorf("cannot write data size: %s", err)
}
if len(ctx.dataBuf) == 0 {
return nil
}
if _, err := ctx.bc.Write(ctx.dataBuf); err != nil {
return fmt.Errorf("cannot write data with size %d: %s", len(ctx.dataBuf), err)
}
return nil
}
// maxErrorMessageSize is the maximum size of error message to send to clients.
const maxErrorMessageSize = 64 * 1024
func (ctx *vmselectRequestCtx) writeErrorMessage(err error) error {
errMsg := err.Error()
if len(errMsg) > maxErrorMessageSize {
// Trim too long error message.
errMsg = errMsg[:maxErrorMessageSize]
}
if err := ctx.writeString(errMsg); err != nil {
return fmt.Errorf("cannot send error message %q to client: %s", errMsg, err)
}
return nil
}
func (ctx *vmselectRequestCtx) writeString(s string) error {
ctx.dataBuf = append(ctx.dataBuf[:0], s...)
return ctx.writeDataBufBytes()
}
func (ctx *vmselectRequestCtx) writeUint64(n uint64) error {
ctx.sizeBuf = encoding.MarshalUint64(ctx.sizeBuf[:0], n)
if _, err := ctx.bc.Write(ctx.sizeBuf); err != nil {
return fmt.Errorf("cannot write uint64 %d: %s", n, err)
}
return nil
}
const maxRPCNameSize = 128
var zeroTime time.Time
func (s *Server) processVMSelectRequest(ctx *vmselectRequestCtx) error {
// Read rpcName
// Do not set a deadline on reading rpcName, since it may take a
// lot of time for an idle connection.
if err := ctx.readDataBufBytes(maxRPCNameSize); err != nil {
if err == io.EOF {
// Remote client gracefully closed the connection.
return err
}
return fmt.Errorf("cannot read rpcName: %s", err)
}
// Limit the time required for reading request args.
if err := ctx.bc.SetReadDeadline(time.Now().Add(5 * time.Second)); err != nil {
return fmt.Errorf("cannot set read deadline for reading request args: %s", err)
}
defer func() {
_ = ctx.bc.SetReadDeadline(zeroTime)
}()
switch string(ctx.dataBuf) {
case "search_v3":
return s.processVMSelectSearchQuery(ctx)
case "labelValues":
return s.processVMSelectLabelValues(ctx)
case "labelEntries":
return s.processVMSelectLabelEntries(ctx)
case "labels":
return s.processVMSelectLabels(ctx)
case "seriesCount":
return s.processVMSelectSeriesCount(ctx)
case "deleteMetrics_v2":
return s.processVMSelectDeleteMetrics(ctx)
default:
return fmt.Errorf("unsupported rpcName: %q", ctx.dataBuf)
}
}
const maxTagFiltersSize = 64 * 1024
func (s *Server) processVMSelectDeleteMetrics(ctx *vmselectRequestCtx) error {
vmselectDeleteMetricsRequests.Inc()
// Read request
if err := ctx.readDataBufBytes(maxTagFiltersSize); err != nil {
return fmt.Errorf("cannot read labelName: %s", err)
}
tail, err := ctx.sq.Unmarshal(ctx.dataBuf)
if err != nil {
return fmt.Errorf("cannot unmarshal SearchQuery: %s", err)
}
if len(tail) > 0 {
return fmt.Errorf("unexpected non-zero tail left after unmarshaling SearchQuery: (len=%d) %q", len(tail), tail)
}
// Setup ctx.tfss
if err := ctx.setupTfss(); err != nil {
return ctx.writeErrorMessage(err)
}
// Delete the given metrics.
deletedCount, err := s.storage.DeleteMetrics(ctx.tfss)
if err != nil {
return ctx.writeErrorMessage(err)
}
// Send an empty error message to vmselect.
if err := ctx.writeString(""); err != nil {
return fmt.Errorf("cannot send empty error message: %s", err)
}
// Send deletedCount to vmselect.
if err := ctx.writeUint64(uint64(deletedCount)); err != nil {
return fmt.Errorf("cannot send deletedCount=%d: %s", deletedCount, err)
}
return nil
}
func (s *Server) processVMSelectLabels(ctx *vmselectRequestCtx) error {
vmselectLabelsRequests.Inc()
// Read request
accountID, err := ctx.readUint32()
if err != nil {
return fmt.Errorf("cannot read accountID: %s", err)
}
projectID, err := ctx.readUint32()
if err != nil {
return fmt.Errorf("cannot read projectID: %s", err)
}
// Search for tag keys
labels, err := s.storage.SearchTagKeys(accountID, projectID, *maxTagKeysPerSearch)
if err != nil {
return ctx.writeErrorMessage(err)
}
// Send an empty error message to vmselect.
if err := ctx.writeString(""); err != nil {
return fmt.Errorf("cannot send empty error message: %s", err)
}
// Send labels to vmselect
for _, label := range labels {
if len(label) == 0 {
// Do this substitution in order to prevent clashing with 'end of response' marker.
label = "__name__"
}
if err := ctx.writeString(label); err != nil {
return fmt.Errorf("cannot write label %q: %s", label, err)
}
}
// Send 'end of response' marker
if err := ctx.writeString(""); err != nil {
return fmt.Errorf("cannot send 'end of response' marker")
}
return nil
}
const maxLabelValueSize = 16 * 1024
func (s *Server) processVMSelectLabelValues(ctx *vmselectRequestCtx) error {
vmselectLabelValuesRequests.Inc()
// Read request
accountID, err := ctx.readUint32()
if err != nil {
return fmt.Errorf("cannot read accountID: %s", err)
}
projectID, err := ctx.readUint32()
if err != nil {
return fmt.Errorf("cannot read projectID: %s", err)
}
if err := ctx.readDataBufBytes(maxLabelValueSize); err != nil {
return fmt.Errorf("cannot read labelName: %s", err)
}
labelName := ctx.dataBuf
// Search for tag values
labelValues, err := s.storage.SearchTagValues(accountID, projectID, labelName, *maxTagValuesPerSearch)
if err != nil {
return ctx.writeErrorMessage(err)
}
// Send an empty error message to vmselect.
if err := ctx.writeString(""); err != nil {
return fmt.Errorf("cannot send empty error message: %s", err)
}
return writeLabelValues(ctx, labelValues)
}
func writeLabelValues(ctx *vmselectRequestCtx, labelValues []string) error {
for _, labelValue := range labelValues {
if len(labelValue) == 0 {
// Skip empty label values, since they make no sense for Prometheus.
continue
}
if err := ctx.writeString(labelValue); err != nil {
return fmt.Errorf("cannot write labelValue %q: %s", labelValue, err)
}
}
// Send 'end of label values' marker
if err := ctx.writeString(""); err != nil {
return fmt.Errorf("cannot send 'end of response' marker")
}
return nil
}
func (s *Server) processVMSelectLabelEntries(ctx *vmselectRequestCtx) error {
	vmselectLabelEntriesRequests.Inc()

	// Read request
	accountID, err := ctx.readUint32()
	if err != nil {
		return fmt.Errorf("cannot read accountID: %s", err)
	}
	projectID, err := ctx.readUint32()
	if err != nil {
		return fmt.Errorf("cannot read projectID: %s", err)
	}

	// Perform the request
	labelEntries, err := s.storage.SearchTagEntries(accountID, projectID, *maxTagKeysPerSearch, *maxTagValuesPerSearch)
	if err != nil {
		return ctx.writeErrorMessage(err)
	}

	// Send an empty error message to vmselect.
	if err := ctx.writeString(""); err != nil {
		return fmt.Errorf("cannot send empty error message: %s", err)
	}

	// Send labelEntries to vmselect
	for i := range labelEntries {
		e := &labelEntries[i]
		label := e.Key
		if label == "" {
			// Do this substitution in order to prevent clashing with 'end of response' marker.
			label = "__name__"
		}
		if err := ctx.writeString(label); err != nil {
			return fmt.Errorf("cannot write label %q: %s", label, err)
		}
		if err := writeLabelValues(ctx, e.Values); err != nil {
			return fmt.Errorf("cannot write label values for %q: %s", label, err)
		}
	}

	// Send 'end of response' marker
	if err := ctx.writeString(""); err != nil {
		return fmt.Errorf("cannot send 'end of response' marker: %s", err)
	}
	return nil
}
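
// processVMSelectSeriesCount sends the total number of time series for the given (accountID, projectID) to vmselect.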
func (s *Server) processVMSelectSeriesCount(ctx *vmselectRequestCtx) error {
	vmselectSeriesCountRequests.Inc()

	// Read request
	accountID, err := ctx.readUint32()
	if err != nil {
		return fmt.Errorf("cannot read accountID: %s", err)
	}
	projectID, err := ctx.readUint32()
	if err != nil {
		return fmt.Errorf("cannot read projectID: %s", err)
	}

	// Execute the request
	n, err := s.storage.GetSeriesCount(accountID, projectID)
	if err != nil {
		return ctx.writeErrorMessage(err)
	}

	// Send an empty error message to vmselect.
	if err := ctx.writeString(""); err != nil {
		return fmt.Errorf("cannot send empty error message: %s", err)
	}

	// Send series count to vmselect.
	if err := ctx.writeUint64(n); err != nil {
		return fmt.Errorf("cannot write series count to vmselect: %s", err)
	}
	return nil
}

// maxSearchQuerySize is the maximum size of SearchQuery packet in bytes.
const maxSearchQuerySize = 1024 * 1024
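
// processVMSelectSearchQuery performs the given SearchQuery and streams the found MetricBlocks to vmselect.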
func (s *Server) processVMSelectSearchQuery(ctx *vmselectRequestCtx) error {
	vmselectSearchQueryRequests.Inc()

	// Read search query.
	if err := ctx.readDataBufBytes(maxSearchQuerySize); err != nil {
		return fmt.Errorf("cannot read searchQuery: %s", err)
	}
	tail, err := ctx.sq.Unmarshal(ctx.dataBuf)
	if err != nil {
		return fmt.Errorf("cannot unmarshal SearchQuery: %s", err)
	}
	if len(tail) > 0 {
		return fmt.Errorf("unexpected non-zero tail left after unmarshaling SearchQuery: (len=%d) %q", len(tail), tail)
	}
	fetchData, err := ctx.readBool()
	if err != nil {
		return fmt.Errorf("cannot read `fetchData` bool: %s", err)
	}

	// Setup search.
	if err := ctx.setupTfss(); err != nil {
		return ctx.writeErrorMessage(err)
	}
	tr := storage.TimeRange{
		MinTimestamp: ctx.sq.MinTimestamp,
		MaxTimestamp: ctx.sq.MaxTimestamp,
	}
	ctx.sr.Init(s.storage, ctx.tfss, tr, fetchData, *maxMetricsPerSearch)
	defer ctx.sr.MustClose()
	if err := ctx.sr.Error(); err != nil {
		return ctx.writeErrorMessage(err)
	}

	// Send empty error message to vmselect.
	if err := ctx.writeString(""); err != nil {
		return fmt.Errorf("cannot send empty error message: %s", err)
	}

	// Send found blocks to vmselect.
	for ctx.sr.NextMetricBlock() {
		mb := ctx.sr.MetricBlock
		vmselectMetricBlocksRead.Inc()
		vmselectMetricRowsRead.Add(mb.Block.RowsCount())
		ctx.dataBuf = mb.Marshal(ctx.dataBuf[:0])
		if err := ctx.writeDataBufBytes(); err != nil {
			return fmt.Errorf("cannot send MetricBlock: %s", err)
		}
	}
	if err := ctx.sr.Error(); err != nil {
		return fmt.Errorf("search error: %s", err)
	}

	// Send 'end of response' marker
	if err := ctx.writeString(""); err != nil {
		return fmt.Errorf("cannot send 'end of response' marker: %s", err)
	}
	return nil
}

var (
	vmselectDeleteMetricsRequests = metrics.NewCounter("vm_vmselect_delete_metrics_requests_total")
	vmselectLabelsRequests        = metrics.NewCounter("vm_vmselect_labels_requests_total")
	vmselectLabelValuesRequests   = metrics.NewCounter("vm_vmselect_label_values_requests_total")
	vmselectLabelEntriesRequests  = metrics.NewCounter("vm_vmselect_label_entries_requests_total")
	vmselectSeriesCountRequests   = metrics.NewCounter("vm_vmselect_series_count_requests_total")
	vmselectSearchQueryRequests   = metrics.NewCounter("vm_vmselect_search_query_requests_total")
	vmselectMetricBlocksRead      = metrics.NewCounter("vm_vmselect_metric_blocks_read_total")
	vmselectMetricRowsRead        = metrics.NewCounter("vm_vmselect_metric_rows_read_total")
)
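
// setupTfss converts ctx.sq.TagFilterss into ctx.tfss, reusing previously allocated storage.TagFilters entries when possible.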
func (ctx *vmselectRequestCtx) setupTfss() error {
	tfss := ctx.tfss[:0]
	for _, tagFilters := range ctx.sq.TagFilterss {
		if len(tfss) < cap(tfss) {
			tfss = tfss[:len(tfss)+1]
		} else {
			tfss = append(tfss, &storage.TagFilters{})
		}
		tfs := tfss[len(tfss)-1]
		tfs.Reset(ctx.sq.AccountID, ctx.sq.ProjectID)
		for i := range tagFilters {
			tf := &tagFilters[i]
			if err := tfs.Add(tf.Key, tf.Value, tf.IsNegative, tf.IsRegexp); err != nil {
				return fmt.Errorf("cannot parse tag filter %s: %s", tf, err)
			}
		}
	}
	ctx.tfss = tfss
	return nil
}
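
The label and search handlers above all share the same response framing: an error string (empty on success), then zero or more payload items, then an empty string as the 'end of response' marker. The sketch below shows how a consumer could decode a string-valued response under that framing; it is only an illustration, and readString is a hypothetical helper mirroring ctx.writeString rather than part of the actual vmselect code.

// readStringList is a hypothetical sketch of the consuming side of the framing
// used by the handlers above. It assumes a readString helper that reads one
// length-prefixed string from the connection, mirroring ctx.writeString.
func readStringList(readString func() (string, error)) ([]string, error) {
	// The first string is the error message; an empty string means success.
	errMsg, err := readString()
	if err != nil {
		return nil, fmt.Errorf("cannot read error message: %s", err)
	}
	if errMsg != "" {
		return nil, fmt.Errorf("error returned by vmstorage: %s", errMsg)
	}
	var items []string
	for {
		s, err := readString()
		if err != nil {
			return nil, fmt.Errorf("cannot read response item: %s", err)
		}
		if s == "" {
			// An empty string is the 'end of response' marker.
			return items, nil
		}
		items = append(items, s)
	}
}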

File diff suppressed because it is too large

View File

@@ -1,13 +1,15 @@
-DOCKER_NAMESPACE := valyala
-BUILDER_IMAGE := local/builder:go1.12.5
-CERTS_IMAGE := local/certs:1.0.2
+# All these commands must run from repository root.
+
+DOCKER_NAMESPACE := docker.io/victoriametrics
+BUILDER_IMAGE := local/builder:go1.14.0
+CERTS_IMAGE := local/certs:1.0.3
 
 package-certs:
-	(docker image ls --format '{{.Repository}}:{{.Tag}}' | grep -q '$(CERTS_IMAGE)') \
+	(docker image ls --format '{{.Repository}}:{{.Tag}}' | grep -q '$(CERTS_IMAGE)$$') \
 		|| docker build -t $(CERTS_IMAGE) deployment/docker/certs
 
 package-builder:
-	(docker image ls --format '{{.Repository}}:{{.Tag}}' | grep -q '$(BUILDER_IMAGE)') \
+	(docker image ls --format '{{.Repository}}:{{.Tag}}' | grep -q '$(BUILDER_IMAGE)$$') \
 		|| docker build -t $(BUILDER_IMAGE) deployment/docker/builder
 
 app-via-docker: package-certs package-builder
@@ -18,25 +20,125 @@ app-via-docker: package-certs package-builder
-w /VictoriaMetrics \
--mount type=bind,src="$(shell pwd)/gocache-for-docker",dst=/gocache \
--env GOCACHE=/gocache \
--env GO111MODULE=on \
$(DOCKER_OPTS) \
$(BUILDER_IMAGE) \
go build $(RACE) -mod=vendor -ldflags "-s -w -extldflags '-static' $(GO_BUILDINFO)" -tags 'netgo osusergo' -o bin/$(APP_NAME)-prod $(PKG_PREFIX)/app/$(APP_NAME)
go build $(RACE) -mod=vendor -trimpath -ldflags "-s -w -extldflags '-static' $(GO_BUILDINFO)" -tags 'netgo osusergo' \
-o bin/$(APP_NAME)$(APP_SUFFIX)-prod $(PKG_PREFIX)/app/$(APP_NAME)
package-via-docker:
(docker image ls --format '{{.Repository}}:{{.Tag}}' | grep -q '$(DOCKER_NAMESPACE)/$(APP_NAME):$(PKG_TAG)$(RACE)') || (\
(docker image ls --format '{{.Repository}}:{{.Tag}}' | grep -q '$(DOCKER_NAMESPACE)/$(APP_NAME):$(PKG_TAG)$(APP_SUFFIX)$(RACE)$$') || (\
$(MAKE) app-via-docker && \
docker build -t $(DOCKER_NAMESPACE)/$(APP_NAME):$(PKG_TAG)$(RACE) -f app/$(APP_NAME)/deployment/Dockerfile .)
docker build \
--build-arg src_binary=$(APP_NAME)$(APP_SUFFIX)-prod \
--build-arg certs_image=$(CERTS_IMAGE) \
-t $(DOCKER_NAMESPACE)/$(APP_NAME):$(PKG_TAG)$(APP_SUFFIX)$(RACE) \
-f app/$(APP_NAME)/deployment/Dockerfile bin)
publish-via-docker: package-via-docker
docker push $(DOCKER_NAMESPACE)/$(APP_NAME):$(PKG_TAG)$(RACE)
docker tag $(DOCKER_NAMESPACE)/$(APP_NAME):$(PKG_TAG)$(RACE) $(DOCKER_NAMESPACE)/$(APP_NAME):latest
docker push $(DOCKER_NAMESPACE)/$(APP_NAME):latest
package-manifest: \
package-via-docker-amd64 \
package-via-docker-arm \
package-via-docker-arm64 \
package-via-docker-ppc64le \
package-via-docker-386
$(MAKE) package-manifest-internal
package-manifest-internal:
docker push $(DOCKER_NAMESPACE)/$(APP_NAME):$(PKG_TAG)-amd64$(RACE)
docker push $(DOCKER_NAMESPACE)/$(APP_NAME):$(PKG_TAG)-arm$(RACE)
docker push $(DOCKER_NAMESPACE)/$(APP_NAME):$(PKG_TAG)-arm64$(RACE)
docker push $(DOCKER_NAMESPACE)/$(APP_NAME):$(PKG_TAG)-ppc64le$(RACE)
docker push $(DOCKER_NAMESPACE)/$(APP_NAME):$(PKG_TAG)-386$(RACE)
DOCKER_CLI_EXPERIMENTAL=enabled docker manifest create --amend $(DOCKER_NAMESPACE)/$(APP_NAME):$(PKG_TAG)$(RACE) \
$(DOCKER_NAMESPACE)/$(APP_NAME):$(PKG_TAG)-amd64$(RACE) \
$(DOCKER_NAMESPACE)/$(APP_NAME):$(PKG_TAG)-arm$(RACE) \
$(DOCKER_NAMESPACE)/$(APP_NAME):$(PKG_TAG)-arm64$(RACE) \
$(DOCKER_NAMESPACE)/$(APP_NAME):$(PKG_TAG)-ppc64le$(RACE) \
$(DOCKER_NAMESPACE)/$(APP_NAME):$(PKG_TAG)-386$(RACE)
GOARCH=amd64 $(MAKE) package-manifest-annotate-goarch
GOARCH=arm $(MAKE) package-manifest-annotate-goarch
GOARCH=arm64 $(MAKE) package-manifest-annotate-goarch
GOARCH=ppc64le $(MAKE) package-manifest-annotate-goarch
GOARCH=386 $(MAKE) package-manifest-annotate-goarch
package-manifest-annotate-goarch:
DOCKER_CLI_EXPERIMENTAL=enabled docker manifest annotate $(DOCKER_NAMESPACE)/$(APP_NAME):$(PKG_TAG)$(RACE) \
$(DOCKER_NAMESPACE)/$(APP_NAME):$(PKG_TAG)-$(GOARCH)$(RACE) --os linux --arch $(GOARCH)
publish-via-docker: package-manifest
docker tag $(DOCKER_NAMESPACE)/$(APP_NAME):$(PKG_TAG)-amd64$(RACE) $(DOCKER_NAMESPACE)/$(APP_NAME):latest-amd64$(RACE)
docker tag $(DOCKER_NAMESPACE)/$(APP_NAME):$(PKG_TAG)-arm$(RACE) $(DOCKER_NAMESPACE)/$(APP_NAME):latest-arm$(RACE)
docker tag $(DOCKER_NAMESPACE)/$(APP_NAME):$(PKG_TAG)-arm64$(RACE) $(DOCKER_NAMESPACE)/$(APP_NAME):latest-arm64$(RACE)
docker tag $(DOCKER_NAMESPACE)/$(APP_NAME):$(PKG_TAG)-ppc64le$(RACE) $(DOCKER_NAMESPACE)/$(APP_NAME):latest-ppc64le$(RACE)
docker tag $(DOCKER_NAMESPACE)/$(APP_NAME):$(PKG_TAG)-386$(RACE) $(DOCKER_NAMESPACE)/$(APP_NAME):latest-386$(RACE)
PKG_TAG=latest $(MAKE) package-manifest-internal
DOCKER_CLI_EXPERIMENTAL=enabled docker manifest push --purge $(DOCKER_NAMESPACE)/$(APP_NAME):$(PKG_TAG)$(RACE)
DOCKER_CLI_EXPERIMENTAL=enabled docker manifest push --purge $(DOCKER_NAMESPACE)/$(APP_NAME):latest$(RACE)
run-via-docker: package-via-docker
docker run -it --rm \
--user $(shell id -u):$(shell id -g) \
--net host \
$(DOCKER_OPTS) \
$(DOCKER_NAMESPACE)/$(APP_NAME):$(PKG_TAG)$(RACE) $(ARGS)
$(DOCKER_NAMESPACE)/$(APP_NAME):$(PKG_TAG)$(APP_SUFFIX)$(RACE) $(ARGS)
app-via-docker-goarch:
APP_SUFFIX='-$(GOARCH)' \
DOCKER_OPTS='--env CGO_ENABLED=$(CGO_ENABLED) --env GOOS=linux --env GOARCH=$(GOARCH)' \
$(MAKE) app-via-docker
app-via-docker-goarch-cgo:
CGO_ENABLED=1 $(MAKE) app-via-docker-goarch
app-via-docker-goarch-nocgo:
CGO_ENABLED=0 $(MAKE) app-via-docker-goarch
app-via-docker-pure:
APP_SUFFIX='-pure' DOCKER_OPTS='--env CGO_ENABLED=0' $(MAKE) app-via-docker
app-via-docker-amd64:
GOARCH=amd64 $(MAKE) app-via-docker-goarch-cgo
app-via-docker-arm:
GOARCH=arm $(MAKE) app-via-docker-goarch-nocgo
app-via-docker-arm64:
GOARCH=arm64 $(MAKE) app-via-docker-goarch-nocgo
app-via-docker-ppc64le:
GOARCH=ppc64le $(MAKE) app-via-docker-goarch-nocgo
app-via-docker-386:
GOARCH=386 $(MAKE) app-via-docker-goarch-nocgo
package-via-docker-goarch:
APP_SUFFIX='-$(GOARCH)' \
DOCKER_OPTS='--env CGO_ENABLED=$(CGO_ENABLED) --env GOOS=linux --env GOARCH=$(GOARCH)' \
$(MAKE) package-via-docker
package-via-docker-goarch-cgo:
CGO_ENABLED=1 $(MAKE) package-via-docker-goarch
package-via-docker-goarch-nocgo:
CGO_ENABLED=0 $(MAKE) package-via-docker-goarch
package-via-docker-pure:
APP_SUFFIX='-pure' DOCKER_OPTS='--env CGO_ENABLED=0' $(MAKE) package-via-docker
package-via-docker-amd64:
GOARCH=amd64 $(MAKE) package-via-docker-goarch-cgo
package-via-docker-arm:
GOARCH=arm $(MAKE) package-via-docker-goarch-nocgo
package-via-docker-arm64:
GOARCH=arm64 $(MAKE) package-via-docker-goarch-nocgo
package-via-docker-ppc64le:
GOARCH=ppc64le $(MAKE) package-via-docker-goarch-nocgo
package-via-docker-386:
GOARCH=386 $(MAKE) package-via-docker-goarch-nocgo
remove-docker-images:
docker image ls --format '{{.Repository}}\t{{.ID}}' | grep $(DOCKER_NAMESPACE)/ | grep -v /builder | awk '{print $$2}' | xargs docker image rm -f

View File

@@ -1 +1,2 @@
-FROM golang:1.12.5
+FROM golang:1.14.0
+STOPSIGNAL SIGINT

Some files were not shown because too many files have changed in this diff