Compare commits

...

12 Commits

Author SHA1 Message Date
Max Kotliar
ad4c703c6e follow-up on b3054bba - remove .codex
Follow-up on
b3054bbadd
2026-06-18 23:36:14 +03:00
Max Kotliar
841b1ddfa9 follow-up on 710c920d - remove leftovers
Follow-up on
710c920d60
2026-06-18 18:08:40 +03:00
Max Kotliar
35696ad72a docs/changelog: chore changelog messages 2026-06-18 13:39:49 +03:00
Max Kotliar
169379ef04 app/vmrestore: disallow restoring parts outside the configured -storageDataPath directory (#1051)
A specifically crafted backup source (compromised S3/GCS/Azure bucket) could use `..`
components in object names to write files outside the `-storageDataPath`
directory.

Fix by validating all source parts against the destination directory
before restore begins (restore.go), and adding a defense-in-depth panic
guard in `NewDirectWriteCloser` (fslocal.go).

PR https://github.com/VictoriaMetrics/VictoriaMetrics-enterprise/pull/1051

Co-authored-by: Roman Khavronenko <roman@victoriametrics.com>
2026-06-18 13:21:18 +03:00
JAYICE
2a853e52e6 app/vmagent: support sharding targets by label in vmagent cluster mode (#11114)
Previously, targets were sharded among `vmagent` instances by all target labels after relabeling. The commit adds `-promscrape.cluster.shardByLabels` optional flag to shard targets by specified labels.

For example, with `-promscrape.cluster.shardByLabels=service`, the targets with the same `service` label value will be scraped by the same `vmagent` instance, 
which is useful when performing stream aggregation (drop pod label) that requires all metrics with the same `service` label value to be processed on the same `vmagent` instance. 

If none of the specified labels are present in the target labels, then all target labels will be used for sharding.

Fixes https://github.com/VictoriaMetrics/VictoriaMetrics/issues/11044
PR https://github.com/VictoriaMetrics/VictoriaMetrics/pull/11114
2026-06-18 13:13:23 +03:00
Zasda Yusuf Mikail
54a3bffd13 app/vmctl: push metrics before exiting on error (#11081)
Call `pushmetrics.StopAndPush()` before `vmctl` exits on `app.Run` errors

PR https://github.com/VictoriaMetrics/VictoriaMetrics/pull/11081

Signed-off-by: Zasda Mikail <zasdaym@gmail.com>
2026-06-18 12:57:59 +03:00
Aliaksandr Valialkin
659465cfde docs/victoriametrics/Articles.md: add https://xata.io/blog/how-we-rebuilt-postgresql-branch-metrics-on-victoriametrics-per-cell 2026-06-18 00:19:37 +02:00
Fred Navruzov
ad8f15a42a docs/vmanomaly: release v1.29.6 (#11132)
Update vmanomaly docs (/anomaly-detection) for patch release v1.29.6

PR https://github.com/VictoriaMetrics/VictoriaMetrics/pull/11132
2026-06-17 20:09:35 +03:00
Max Kotliar
eec4f80c4c app/vmauth: refactor oidc discovery pool creation (#11123)
Initializing `oidcDP` outside or before `parseJWTUsers` would simplify its reuse with SSO implementation https://github.com/VictoriaMetrics/VictoriaMetrics/pull/11122

PR https://github.com/VictoriaMetrics/VictoriaMetrics/pull/11123
2026-06-17 13:53:00 +03:00
Nikolay
cde82838c2 lib/fs: retry file deletion on NFS error
This commit adds additional file removal retries.
This is follow-up for
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/9842.

According to the stack trace, NFS also could return error for
`deleteFilePath`.
This retry is safe, since part already removed from `parts.json` and
performs a skips empty directories on restart.

Fixes https://github.com/VictoriaMetrics/VictoriaMetrics/issues/11060
2026-06-17 09:17:51 +02:00
Alexander Frolov
dd5e128525 app/vmagent: properly copy metadata unit value
Previously, Unit value at metrics metadata was incorrectly referenced instead of memory copy:

ed795a8443/app/vmagent/remotewrite/pendingseries.go (L209-L212)

 This commit properly copies Unit into dedicated buffer and prevents memory corruption.

Related PR https://github.com/VictoriaMetrics/VictoriaMetrics/pull/11120
2026-06-17 09:17:51 +02:00
Max Kotliar
cb8a6c9902 lib/httpserver: add comments and links to tlsErrorSkipLogger
Follow-up on
64e43e59a7

See discussion in
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/10538#issuecomment-4708196302
2026-06-16 18:39:45 +03:00
26 changed files with 284 additions and 52 deletions

0
.codex
View File

View File

@@ -209,13 +209,12 @@ func (wr *writeRequest) tryPushMetadata(mms []prompb.MetricMetadata) bool {
func (wr *writeRequest) copyMetadata(dst, src *prompb.MetricMetadata) {
// Direct copy for non-string fields, which are safe by value.
dst.Type = src.Type
dst.Unit = src.Unit
dst.AccountID = src.AccountID
dst.ProjectID = src.ProjectID
// Pre-allocate memory for all string fields.
neededBufLen := len(src.MetricFamilyName) + len(src.Help)
neededBufLen := len(src.MetricFamilyName) + len(src.Help) + len(src.Unit)
bufLen := len(wr.metadatabuf)
wr.metadatabuf = slicesutil.SetLength(wr.metadatabuf, bufLen+neededBufLen)
buf := wr.metadatabuf[:bufLen]
@@ -230,6 +229,11 @@ func (wr *writeRequest) copyMetadata(dst, src *prompb.MetricMetadata) {
buf = append(buf, src.Help...)
dst.Help = bytesutil.ToUnsafeString(buf[bufLen:])
// Copy Unit
bufLen = len(buf)
buf = append(buf, src.Unit...)
dst.Unit = bytesutil.ToUnsafeString(buf[bufLen:])
wr.metadatabuf = buf
}

View File

@@ -911,7 +911,8 @@ func reloadAuthConfigData(data []byte) (bool, error) {
return false, fmt.Errorf("failed to parse auth config: %w", err)
}
jui, oidcDP, err := parseJWTUsers(ac)
oidcDP := &oidcDiscovererPool{}
jui, err := parseJWTUsers(ac, oidcDP)
if err != nil {
return false, fmt.Errorf("failed to parse JWT users from auth config: %w", err)
}

View File

@@ -72,9 +72,8 @@ type JWTConfig struct {
verifierPool atomic.Pointer[jwt.VerifierPool]
}
func parseJWTUsers(ac *AuthConfig) ([]*UserInfo, *oidcDiscovererPool, error) {
func parseJWTUsers(ac *AuthConfig, oidcDP *oidcDiscovererPool) ([]*UserInfo, error) {
jui := make([]*UserInfo, 0, len(ac.Users))
oidcDP := &oidcDiscovererPool{}
uniqClaims := make(map[string]*UserInfo)
var sortedClaims []string
@@ -85,10 +84,10 @@ func parseJWTUsers(ac *AuthConfig) ([]*UserInfo, *oidcDiscovererPool, error) {
}
if ui.AuthToken != "" || ui.BearerToken != "" || ui.Username != "" || ui.Password != "" {
return nil, nil, fmt.Errorf("auth_token, bearer_token, username and password cannot be specified if jwt is set")
return nil, fmt.Errorf("auth_token, bearer_token, username and password cannot be specified if jwt is set")
}
if len(jwtToken.PublicKeys) == 0 && len(jwtToken.PublicKeyFiles) == 0 && !jwtToken.SkipVerify && jwtToken.OIDC == nil {
return nil, nil, fmt.Errorf("jwt must contain at least a single public key, public_key_files, oidc or have skip_verify=true")
return nil, fmt.Errorf("jwt must contain at least a single public key, public_key_files, oidc or have skip_verify=true")
}
var claimsString string
sortedClaims = sortedClaims[:0]
@@ -97,7 +96,7 @@ func parseJWTUsers(ac *AuthConfig) ([]*UserInfo, *oidcDiscovererPool, error) {
sortedClaims = append(sortedClaims, fmt.Sprintf("%s=%s", ck, cv))
pc, err := jwt.NewClaim(ck, cv)
if err != nil {
return nil, nil, fmt.Errorf("incorrect match claim, key=%q, value regex=%q: %w", ck, cv, err)
return nil, fmt.Errorf("incorrect match claim, key=%q, value regex=%q: %w", ck, cv, err)
}
parsedClaims = append(parsedClaims, pc)
}
@@ -106,7 +105,7 @@ func parseJWTUsers(ac *AuthConfig) ([]*UserInfo, *oidcDiscovererPool, error) {
claimsString = strings.Join(sortedClaims, ",")
if oldUI, ok := uniqClaims[claimsString]; ok {
return nil, nil, fmt.Errorf("duplicate match claims=%q found for name=%q at idx=%d; the previous one is set for name=%q", claimsString, ui.Name, idx, oldUI.Name)
return nil, fmt.Errorf("duplicate match claims=%q found for name=%q at idx=%d; the previous one is set for name=%q", claimsString, ui.Name, idx, oldUI.Name)
}
uniqClaims[claimsString] = &ui
if len(jwtToken.PublicKeys) > 0 || len(jwtToken.PublicKeyFiles) > 0 {
@@ -115,7 +114,7 @@ func parseJWTUsers(ac *AuthConfig) ([]*UserInfo, *oidcDiscovererPool, error) {
for i := range jwtToken.PublicKeys {
k, err := jwt.ParseKey([]byte(jwtToken.PublicKeys[i]))
if err != nil {
return nil, nil, err
return nil, err
}
keys = append(keys, k)
}
@@ -123,52 +122,52 @@ func parseJWTUsers(ac *AuthConfig) ([]*UserInfo, *oidcDiscovererPool, error) {
for _, filePath := range jwtToken.PublicKeyFiles {
keyData, err := os.ReadFile(filePath)
if err != nil {
return nil, nil, fmt.Errorf("cannot read public key from file %q: %w", filePath, err)
return nil, fmt.Errorf("cannot read public key from file %q: %w", filePath, err)
}
k, err := jwt.ParseKey(keyData)
if err != nil {
return nil, nil, fmt.Errorf("cannot parse public key from file %q: %w", filePath, err)
return nil, fmt.Errorf("cannot parse public key from file %q: %w", filePath, err)
}
keys = append(keys, k)
}
vp, err := jwt.NewVerifierPool(keys)
if err != nil {
return nil, nil, err
return nil, err
}
jwtToken.verifierPool.Store(vp)
}
if jwtToken.OIDC != nil {
if len(jwtToken.PublicKeys) > 0 || len(jwtToken.PublicKeyFiles) > 0 || jwtToken.SkipVerify {
return nil, nil, fmt.Errorf("jwt with oidc cannot contain public keys or have skip_verify=true")
return nil, fmt.Errorf("jwt with oidc cannot contain public keys or have skip_verify=true")
}
if jwtToken.OIDC.Issuer == "" {
return nil, nil, fmt.Errorf("oidc issuer cannot be empty")
return nil, fmt.Errorf("oidc issuer cannot be empty")
}
isserURL, err := url.Parse(jwtToken.OIDC.Issuer)
if err != nil {
return nil, nil, fmt.Errorf("oidc issuer %q must be a valid URL", jwtToken.OIDC.Issuer)
return nil, fmt.Errorf("oidc issuer %q must be a valid URL", jwtToken.OIDC.Issuer)
}
if isserURL.Scheme != "https" && isserURL.Scheme != "http" {
return nil, nil, fmt.Errorf("oidc issuer %q must have http or https scheme", jwtToken.OIDC.Issuer)
return nil, fmt.Errorf("oidc issuer %q must have http or https scheme", jwtToken.OIDC.Issuer)
}
oidcDP.createOrAdd(ui.JWT.OIDC.Issuer, &ui.JWT.verifierPool)
}
if err := parseJWTPlaceholdersForUserInfo(&ui, true); err != nil {
return nil, nil, err
return nil, err
}
if err := ui.initURLs(); err != nil {
return nil, nil, err
return nil, err
}
metricLabels, err := ui.getMetricLabels()
if err != nil {
return nil, nil, fmt.Errorf("cannot parse metric_labels: %w", err)
return nil, fmt.Errorf("cannot parse metric_labels: %w", err)
}
ui.requests = ac.ms.GetOrCreateCounter(`vmauth_user_requests_total` + metricLabels)
ui.requestErrors = ac.ms.GetOrCreateCounter(`vmauth_user_request_errors_total` + metricLabels)
@@ -187,7 +186,7 @@ func parseJWTUsers(ac *AuthConfig) ([]*UserInfo, *oidcDiscovererPool, error) {
rt, err := newRoundTripper(ui.TLSCAFile, ui.TLSCertFile, ui.TLSKeyFile, ui.TLSServerName, ui.TLSInsecureSkipVerify)
if err != nil {
return nil, nil, fmt.Errorf("cannot initialize HTTP RoundTripper: %w", err)
return nil, fmt.Errorf("cannot initialize HTTP RoundTripper: %w", err)
}
ui.rt = rt
@@ -200,7 +199,7 @@ func parseJWTUsers(ac *AuthConfig) ([]*UserInfo, *oidcDiscovererPool, error) {
return len(jui[i].JWT.MatchClaims) > len(jui[j].JWT.MatchClaims)
})
return jui, oidcDP, nil
return jui, nil
}
var tokenPool sync.Pool

View File

@@ -39,16 +39,14 @@ XOtclIk1uhc03oL9nOQ=
}
return
}
users, oidcDP, err := parseJWTUsers(ac)
oidcDP := &oidcDiscovererPool{}
users, err := parseJWTUsers(ac, oidcDP)
if err == nil {
t.Fatalf("expecting non-nil error; got %v", users)
}
if expErr != err.Error() {
t.Fatalf("unexpected error; got\n%q\nwant \n%q", err.Error(), expErr)
}
if oidcDP != nil {
t.Fatalf("expecting nil oidcDP; got %v", oidcDP)
}
}
// unauthorized_user cannot be used with jwt
@@ -326,7 +324,8 @@ XOtclIk1uhc03oL9nOQ=
t.Fatalf("unexpected error: %s", err)
}
jui, oidcDP, err := parseJWTUsers(ac)
oidcDP := &oidcDiscovererPool{}
jui, err := parseJWTUsers(ac, oidcDP)
if err != nil {
t.Fatalf("unexpected error: %s", err)
}

View File

@@ -563,11 +563,11 @@ func main() {
}()
err = app.Run(os.Args)
pushmetrics.StopAndPush()
if err != nil {
log.Fatalln(err)
}
log.Printf("Total time: %v", time.Since(start))
pushmetrics.StopAndPush()
}
func initConfigVM(c *cli.Context) (vm.Config, error) {

View File

@@ -59,7 +59,7 @@ services:
- '--external.alert.source=explore?orgId=1&left=["now-1h","now","VictoriaMetrics",{"expr": },{"mode":"Metrics"},{"ui":[true,true,true,"none"]}]'
restart: always
vmanomaly:
image: victoriametrics/vmanomaly:v1.29.5
image: victoriametrics/vmanomaly:v1.29.6
depends_on:
- "victoriametrics"
ports:

View File

@@ -14,6 +14,13 @@ aliases:
---
Please find the changelog for VictoriaMetrics Anomaly Detection below.
## v1.29.6
Released: 2026-06-17
- BUGFIX: Fixed `VLogsReader` startup and query execution when `tenant_id` is omitted or provided in short account-only form such as `"0"`. Omitted or empty tenant IDs are treated as single-node/no-tenant mode, and account-only tenant IDs are expanded to `accountID:0` before adding VictoriaLogs `AccountID`/`ProjectID` params or VM tenant labels.
- BUGFIX: Hardened [`OnlineMADModel`](https://docs.victoriametrics.com/anomaly-detection/components/models/#online-mad) anomaly scoring for perfectly constant time series (all values identical). The model now keeps a small deterministic prediction interval when the learned MAD is zero, so values deviating from an unknown constant baseline can produce `anomaly_score > 1` (previously, all anomaly scores were `0`).
## v1.29.5
Released: 2026-06-11

View File

@@ -423,7 +423,7 @@ services:
# ...
vmanomaly:
container_name: vmanomaly
image: victoriametrics/vmanomaly:v1.29.5
image: victoriametrics/vmanomaly:v1.29.6
# ...
restart: always
volumes:
@@ -641,7 +641,7 @@ options:
Heres an example of using the config splitter to divide configurations based on the `extra_filters` argument from the reader section:
```sh
docker pull victoriametrics/vmanomaly:v1.29.5 && docker image tag victoriametrics/vmanomaly:v1.29.5 vmanomaly
docker pull victoriametrics/vmanomaly:v1.29.6 && docker image tag victoriametrics/vmanomaly:v1.29.6 vmanomaly
```
```sh

View File

@@ -45,7 +45,7 @@ There are 2 types of compatibility to consider when migrating in stateful mode:
| Group start | Group end | Compatibility | Notes |
|---------|--------- |------------|-------|
| [v1.29.1](https://docs.victoriametrics.com/anomaly-detection/changelog/#v1291) | [v1.29.5](https://docs.victoriametrics.com/anomaly-detection/changelog/#v1295) | Fully Compatible | - |
| [v1.29.1](https://docs.victoriametrics.com/anomaly-detection/changelog/#v1291) | [v1.29.6](https://docs.victoriametrics.com/anomaly-detection/changelog/#v1296) | Fully Compatible | - |
| [v1.28.7](https://docs.victoriametrics.com/anomaly-detection/changelog/#v1287) | [v1.29.0](https://docs.victoriametrics.com/anomaly-detection/changelog/#v1290) | Partially compatible* | Dumped models of class [prophet](https://docs.victoriametrics.com/anomaly-detection/components/models/#prophet) and [seasonal quantile](https://docs.victoriametrics.com/anomaly-detection/components/models/#online-seasonal-quantile) have problems with loading to [v1.29.0](https://docs.victoriametrics.com/anomaly-detection/changelog/#v1290) due to dropped `pytz` library. **Upgrading directly from v1.28.7 to [v1.29.1](https://docs.victoriametrics.com/anomaly-detection/changelog/#v1291) with a fix is suggested** |
| [v1.26.0](https://docs.victoriametrics.com/anomaly-detection/changelog/#v1262) | [v1.28.7](https://docs.victoriametrics.com/anomaly-detection/changelog/#v1287) | Fully Compatible | [v1.28.0](https://docs.victoriametrics.com/anomaly-detection/changelog/#v1280) introduced [rolling](https://docs.victoriametrics.com/anomaly-detection/components/models/#rolling-models) model class drop in favor of [online](https://docs.victoriametrics.com/anomaly-detection/components/models/#online-models) models (`rolling_quantile` and `std` models), however, it does not impact compatibility, as artifacts were not produced by default for rolling models. Also, offline `mad` and `zscore` models are redirecting to their respective online counterparts since [v1.28.4](https://docs.victoriametrics.com/anomaly-detection/changelog/#v1284). |
| [v1.25.3](https://docs.victoriametrics.com/anomaly-detection/changelog/#v1253) | [v1.26.0](https://docs.victoriametrics.com/anomaly-detection/changelog/#v1270) | Partially Compatible* | [v1.25.3](https://docs.victoriametrics.com/anomaly-detection/changelog/#v1253) introduced `forecast_at` argument for base [univariate](https://docs.victoriametrics.com/anomaly-detection/components/models/#univariate-models) and `Prophet` [models](https://docs.victoriametrics.com/anomaly-detection/components/models/#prophet), however, itself remains backward-reversible from newer states like [v1.26.2](https://docs.victoriametrics.com/anomaly-detection/changelog/#v1262), [v1.27.0](https://docs.victoriametrics.com/anomaly-detection/changelog/#v1270). (All models except `isolation_forest_multivariate` class will be dropped) |

View File

@@ -132,7 +132,7 @@ Below are the steps to get `vmanomaly` up and running inside a Docker container:
1. Pull Docker image:
```sh
docker pull victoriametrics/vmanomaly:v1.29.5
docker pull victoriametrics/vmanomaly:v1.29.6
```
2. Create the license file with your license key.
@@ -152,7 +152,7 @@ docker run -it \
-v ./license:/license \
-v ./config.yaml:/config.yaml \
-p 8490:8490 \
victoriametrics/vmanomaly:v1.29.5 \
victoriametrics/vmanomaly:v1.29.6 \
/config.yaml \
--licenseFile=/license \
--loggerLevel=INFO \
@@ -169,7 +169,7 @@ docker run -it \
-e VMANOMALY_DATA_DUMPS_DIR=/tmp/vmanomaly/data \
-e VMANOMALY_MODEL_DUMPS_DIR=/tmp/vmanomaly/models \
-p 8490:8490 \
victoriametrics/vmanomaly:v1.29.5 \
victoriametrics/vmanomaly:v1.29.6 \
/config.yaml \
--licenseFile=/license \
--loggerLevel=INFO \
@@ -182,7 +182,7 @@ services:
# ...
vmanomaly:
container_name: vmanomaly
image: victoriametrics/vmanomaly:v1.29.5
image: victoriametrics/vmanomaly:v1.29.6
# ...
restart: always
volumes:

View File

@@ -315,7 +315,7 @@ docker run -it --rm \
-e VMANOMALY_MCP_SERVER_URL=http://mcp-vmanomaly:8081/mcp \
-p 8080:8080 \
-p 8490:8490 \
victoriametrics/vmanomaly:v1.29.5 \
victoriametrics/vmanomaly:v1.29.6 \
vmanomaly_config.yaml
```

View File

@@ -1265,7 +1265,7 @@ monitoring:
Let's pull the docker image for `vmanomaly`:
```sh
docker pull victoriametrics/vmanomaly:v1.29.5
docker pull victoriametrics/vmanomaly:v1.29.6
```
Now we can run the docker container putting as volumes both config and model file:
@@ -1279,7 +1279,7 @@ docker run -it \
-v $(PWD)/license:/license \
-v $(PWD)/custom_model.py:/vmanomaly/model/custom.py \
-v $(PWD)/custom.yaml:/config.yaml \
victoriametrics/vmanomaly:v1.29.5 /config.yaml \
victoriametrics/vmanomaly:v1.29.6 /config.yaml \
--licenseFile=/license
--watch
```

View File

@@ -395,7 +395,7 @@ services:
restart: always
vmanomaly:
container_name: vmanomaly
image: victoriametrics/vmanomaly:v1.29.5
image: victoriametrics/vmanomaly:v1.29.6
depends_on:
- "victoriametrics"
ports:

View File

@@ -19,6 +19,7 @@ See also [case studies](https://docs.victoriametrics.com/victoriametrics/casestu
* [Datanami: Why Roblox Picked VictoriaMetrics for Observability Data Overhaul](https://www.hpcwire.com/bigdatawire/2023/05/30/why-roblox-picked-victoriametrics-for-observability-data-overhaul/)
* [Cloudflare: Introducing notifications for HTTP Traffic Anomalies](https://blog.cloudflare.com/introducing-http-traffic-anomalies-notifications/)
* [Grammarly: Better, Faster, Cheaper: How Grammarly Improved Monitoring by Over 10x with VictoriaMetrics](https://www.grammarly.com/blog/engineering/monitoring-with-victoriametrics/)
* [Xata: How we rebuilt PostgreSQL branch metrics on VictoriaMetrics, per cell](https://xata.io/blog/how-we-rebuilt-postgresql-branch-metrics-on-victoriametrics-per-cell)
* [CERN: CMS monitoring R&D: Real-time monitoring and alerts](https://indico.cern.ch/event/877333/contributions/3696707/attachments/1972189/3281133/CMS_mon_RD_for_opInt.pdf)
* [CERN: The CMS monitoring infrastructure and applications](https://arxiv.org/pdf/2007.03630.pdf)
* [Forbes: The (Almost) Infinitely Scalable Open Source Monitoring Dream](https://www.forbes.com/sites/adrianbridgwater/2022/08/16/the-almost-infinitely-scalable-open-source-monitoring-dream/)

View File

@@ -30,11 +30,16 @@ See also [LTS releases](https://docs.victoriametrics.com/victoriametrics/lts-rel
* FEATURE: [vmsingle](https://docs.victoriametrics.com/victoriametrics/single-server-victoriametrics/) and `vmselect` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/victoriametrics/cluster-victoriametrics/): log calls to [/api/v1/admin/tsdb/delete_series](https://docs.victoriametrics.com/victoriametrics/url-examples/#apiv1admintsdbdelete_series) API handler. This should help to identify events of metrics deletion from the database. See [#11104](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/11104).
* FEATURE: [vmctl](https://docs.victoriametrics.com/victoriametrics/vmctl/): add `-vm-headers` and `-vm-bearer-token` flags for authenticating requests to the VictoriaMetrics import destination. The flags are available in `opentsdb`, `influx`, `remote-read`, `prometheus`, `mimir`, and `thanos` vmctl sub-commands. See [#8897](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/8897).
* FEATURE: [vmui](https://docs.victoriametrics.com/victoriametrics/single-server-victoriametrics/#vmui): add the `last` value to graph legend statistics. See [#10759](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/10759).
* FEATURE: [vmagent](https://docs.victoriametrics.com/victoriametrics/vmagent/): add `-promscrape.cluster.shardByLabels` command-line flag for selecting target labels used for sharding scrape targets among `vmagent` instances in cluster mode. See [#11044](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/11044).
* BUGFIX: [stream aggregation](https://docs.victoriametrics.com/victoriametrics/stream-aggregation/): fix issue with producing aggregated samples with identical timestamps between flushes. See [#10808](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/10808).
* BUGFIX: [vmalert](https://docs.victoriametrics.com/victoriametrics/vmalert/),[vmauth](https://docs.victoriametrics.com/victoriametrics/vmauth/),[vmagent](https://docs.victoriametrics.com/victoriametrics/vmagent/) and [vmsingle](https://docs.victoriametrics.com/victoriametrics/single-server-victoriametrics/): fix rare unbounded shutdown delay when config reload takes longer than `-configCheckInterval`. See [#11107](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/11107). Thanks to @PleasingFungus for contribution.
* BUGFIX: `vmselect` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/victoriametrics/cluster-victoriametrics/): fix corrupted metrics metadata when a response contains multiple rows. See [#11115](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/11115). Thanks for @fxrlv for the contribution.
* BUGFIX: [vmbackup](https://docs.victoriametrics.com/vmbackup/), [vmbackupmanager](https://docs.victoriametrics.com/victoriametrics/vmbackupmanager/): do not fail backup list if directory is absent while using `fs://` destination to align with other protocols. See [6c3c548](https://github.com/VictoriaMetrics/VictoriaMetrics/commit/6c3c548ddb0385b749e731f52276f130e2a4e4a8)
* BUGFIX: [vmagent](https://docs.victoriametrics.com/victoriametrics/vmagent/): fix potential corruption of remote-write metadata `Unit` values. See [#11120](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/11120). Thanks for @fxrlv for the contribution.
* BUGFIX: [vmbackup](https://docs.victoriametrics.com/vmbackup/), [vmbackupmanager](https://docs.victoriametrics.com/victoriametrics/vmbackupmanager/): do not fail backup list if directory is absent while using `fs://` destination to align with other protocols. See [6c3c548d](https://github.com/VictoriaMetrics/VictoriaMetrics/commit/6c3c548ddb0385b749e731f52276f130e2a4e4a8)
* BUGFIX: [vmsingle](https://docs.victoriametrics.com/victoriametrics/single-server-victoriametrics/) and `vmstorage` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/victoriametrics/cluster-victoriametrics/): prevent more cases of panic during directory deletion on `NFS`-based mounts. See [#11060](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/11060).
* BUGFIX: [vmctl](https://docs.victoriametrics.com/victoriametrics/vmctl/): push metrics to configured `-pushmetrics.url` on shutdown when migration fails. Previously, metrics were not pushed if vmctl exited with an error. See [#11081](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/11081). Thanks to @zasdaym for contribution.
* BUGFIX: [vmrestore](https://docs.victoriametrics.com/victoriametrics/vmrestore/): disallow restoring parts outside the configured `-storageDataPath` directory. See [710c920d](https://github.com/VictoriaMetrics/VictoriaMetrics/commit/710c920d6083327042a309e449fae4383617d817).
## [v1.145.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.145.0)

View File

@@ -797,6 +797,12 @@ For example, the following commands spread scrape targets among a cluster of two
The `-promscrape.cluster.memberNum` can be set to a StatefulSet pod name when `vmagent` runs in Kubernetes.
The pod name must end with a number in the range `0 ... promscrape.cluster.membersCount-1`. For example, `-promscrape.cluster.memberNum=vmagent-0`.
By default, targets are sharded among `vmagent` instances by all target labels after relabeling.
Use `-promscrape.cluster.shardByLabels` {{% available_from "#" %}} to shard targets by specified labels instead.
For example, with `-promscrape.cluster.shardByLabels=service`, the targets with the same `service` label value will be scraped by the same `vmagent` instance,
which is useful when perform stream aggregation that requires all metrics with the same `service` label value to be processed on the same `vmagent` instance.
If none of the specified labels are present in the target labels, then all target labels will be used for sharding.
By default, each scrape target is scraped only by a single `vmagent` instance in the cluster. If there is a need for replicating scrape targets among multiple `vmagent` instances,
then `-promscrape.cluster.replicationFactor` command-line flag must be set to the desired number of replicas. For example, the following commands
start a cluster of three `vmagent` instances, where two `vmagent` instances scrape each target:

View File

@@ -91,6 +91,11 @@ func (r *Restore) Run(ctx context.Context) error {
if err != nil {
return fmt.Errorf("cannot list src parts: %w", err)
}
for _, srcPart := range srcParts {
if !srcPart.IsLocalPathInsideDir(r.Dst.Dir) {
return fmt.Errorf("part file %s would be written outside storage directory %s", srcPart.Path, r.Dst.Dir)
}
}
logger.Infof("obtaining list of parts at %s", dst)
dstParts, err := dst.ListParts()
if err != nil {

View File

@@ -120,6 +120,17 @@ func (p *Part) ParseFromRemotePath(remotePath string) bool {
return true
}
// IsLocalPathInsideDir returns true if the part's local path resolves inside dir.
// It resolves ../../ sequences and prevents path traversal outside dir.
func (p *Part) IsLocalPathInsideDir(dir string) bool {
dir = filepath.Clean(dir)
if dir == `/` {
return true
}
return strings.HasPrefix(p.LocalPath(dir), dir+string(filepath.Separator))
}
// MaxPartSize is the maximum size for each part.
//
// The MaxPartSize reduces bandwidth usage during retires on network errors

View File

@@ -0,0 +1,54 @@
package common
import (
"testing"
)
func TestIsLocalPathInsideDir(t *testing.T) {
f := func(dir, path string, expected bool) {
t.Helper()
p := Part{Path: path}
if got := p.IsLocalPathInsideDir(dir); got != expected {
t.Fatalf("IsLocalPathInsideDir(%q, %q): got %v, want %v", dir, path, got, expected)
}
}
// normal path inside dir
f("/data/storage", "parts/segment1/data.bin", true)
// dir with trailing slash is normalized
f("/data/storage/", "parts/segment1/data.bin", true)
// deeply nested path
f("/data/storage", "a/b/c/d/e/file.dat", true)
// traversal that stays inside dir
f("/data/storage", "foo/../bar/file.dat", true)
// root dir allows any path
f("/", "any/path/here", true)
// root dir allows traversal attempts since nothing is outside /
f("/", "../outside/marker.txt", true)
// path with leading slash is treated as relative by filepath.Join and stays inside dir
f("/data/storage", "/outside/marker.txt", true)
// dir with .. components is normalized; path inside resolved dir
f("/data/storage/../foo", "parts/file.dat", true)
// dir with .. components is normalized; traversal outside resolved dir
f("/data/storage/../foo", "../storage/evil.txt", false)
// simple traversal
f("/data/storage", "../outside/marker.txt", false)
// traversal with trailing slash in dir
f("/data/storage/", "../outside/marker.txt", false)
// deep traversal
f("/data/storage", "a/../../outside/marker.txt", false)
// sibling directory whose name shares a prefix with dir
f("/data/storage", "../storagefoo/evil.txt", false)
}

View File

@@ -129,6 +129,10 @@ func (fs *FS) NewReadCloser(p common.Part) (io.ReadCloser, error) {
// On platforms with preallocation, writes go to a .tmp file that must be
// finalized with FinalizeFile.
func (fs *FS) NewDirectWriteCloser(p common.Part) (io.WriteCloser, error) {
if !p.IsLocalPathInsideDir(fs.Dir) {
logger.Fatalf("BUG: part file %s would be written outside storage directory %s", p.Path, fs.Dir)
}
path := fs.writePath(p)
if err := fs.mkdirAll(path); err != nil {
return nil, err

View File

@@ -179,6 +179,9 @@ func tryRemoveDir(dirPath string) bool {
// times simultaneously and properly close it, fs caching may still
// confuse NFS client.
if err := os.RemoveAll(dirEntryPath); err != nil {
if os.IsNotExist(err) {
return
}
if !isTemporaryNFSError(err) {
logger.Fatalf("FATAL: cannot remove %q: %s", dirEntryPath, err)
}
@@ -203,8 +206,9 @@ func tryRemoveDir(dirPath string) bool {
deleteFilePath := filepath.Join(dirPath, deleteDirFilename)
// Remove the deleteDirFilename file, since there are no other entries left in the directory.
MustRemovePath(deleteFilePath)
if !tryRemovePath(deleteFilePath) {
return false
}
// Sync the directory after the removing deletDirFilename file in order to make sure
// all the metadata files are removed at some exotic filesystems such as OSSFS2.
// See https://github.com/VictoriaMetrics/VictoriaLogs/issues/649
@@ -212,7 +216,9 @@ func tryRemoveDir(dirPath string) bool {
MustSyncPath(dirPath)
// Remove the dirPath itself
MustRemovePath(dirPath)
if !tryRemovePath(dirPath) {
return false
}
// Do not sync the parent directory for the dirPath - the caller can do this if needed.
// It is OK if the dirPath will remain undeleted after unclean shutdown - it will be deleted
@@ -221,6 +227,23 @@ func tryRemoveDir(dirPath string) bool {
return true
}
// tryRemovePath removes given path and returns true on success
// or false if error is temporary NFS error
func tryRemovePath(path string) bool {
if err := os.Remove(path); err != nil {
if os.IsNotExist(err) {
return true
}
if !isTemporaryNFSError(err) {
logger.Fatalf("FATAL: cannot remove %q: %s", path, err)
}
nfsDirRemoveFailedAttempts.Inc()
return false
}
return true
}
var (
dirRemoverWG sync.WaitGroup
nfsDirRemoveFailedAttempts = metrics.NewCounter(`vm_nfs_dir_remove_failed_attempts_total`)

View File

@@ -811,10 +811,20 @@ func LogError(req *http.Request, errStr string) {
logger.Errorf("uri: %s, remote address: %q: %s", uri, remoteAddr, errStr)
}
// tlsErrorSkipLogger must be passed as the out argument to log.New only.
// It suppresses noisy TCP probe errors on TLS connections to avoid log pollution.
//
// This cannot be implemented in net.Listener because a TLS handshake may take seconds,
// during which no other connections can be accepted. Therefor, the implementation inside net.Listener can lead to DoS.
// Once a connection is passed to the conn serve goroutine, there is no direct access to the handshake logic, so this indirect
// approach is used instead.
type tlsErrorSkipLogger struct{}
// Write filters out TLS handshake errors from health-check probes.
// log.Logger guarantees that each complete message is delivered in a single Write call
// and that calls are serialized, so we can safely inspect p for a "TLS handshake error".
// See https://github.com/golang/go/blob/38e988efb4b8f5e73e887027f386a342c138b649/src/log/log.go#L53-L57
func (*tlsErrorSkipLogger) Write(p []byte) (int, error) {
// skip common health check errors produced by Kubernetes and other tools
if bytes.Contains(p, []byte("TLS handshake error")) &&
(bytes.Contains(p, []byte("EOF")) || bytes.Contains(p, []byte("connection reset by peer"))) {
return len(p), nil

View File

@@ -76,6 +76,9 @@ var (
"Every %d occurrence in the template is substituted with -promscrape.cluster.memberNum at urls to vmagent instances responsible for scraping the given target "+
"at /service-discovery page. For example -promscrape.cluster.memberURLTemplate='http://vmagent-%d:8429/targets'. "+
"See https://docs.victoriametrics.com/victoriametrics/vmagent/#scraping-big-number-of-targets for more details")
clusterShardByLabels = flagutil.NewArrayString("promscrape.cluster.shardByLabels", "Optional list of target labels, which will be used for sharding targets among cluster members "+
"if -promscrape.cluster.membersCount is greater than 1. If none of the specified labels are found in a target, then all the target labels will be used for sharding. "+
"See https://docs.victoriametrics.com/victoriametrics/vmagent/#scraping-big-number-of-targets for more info")
clusterReplicationFactor = flag.Int("promscrape.cluster.replicationFactor", 1, "The number of members in the cluster, which scrape the same targets. "+
"If the replication factor is greater than 1, then the deduplication must be enabled at remote storage side. "+
"See https://docs.victoriametrics.com/victoriametrics/vmagent/#scraping-big-number-of-targets for more info")
@@ -86,7 +89,10 @@ var (
"Bigger uncompressed responses are rejected. See also max_scrape_size option at https://docs.victoriametrics.com/victoriametrics/sd_configs/#scrape_configs")
)
var clusterMemberID int
var (
clusterMemberID int
clusterShardByLabelsSorted []string
)
func mustInitClusterMemberID() {
s := *clusterMemberNum
@@ -110,6 +116,15 @@ func mustInitClusterMemberID() {
clusterMemberID = n
}
func initClusterShardByLabels() {
if len(*clusterShardByLabels) == 0 {
clusterShardByLabelsSorted = nil
return
}
clusterShardByLabelsSorted = slices.Clone(*clusterShardByLabels)
slices.Sort(clusterShardByLabelsSorted)
}
// Config represents essential parts from Prometheus config defined at https://prometheus.io/docs/prometheus/latest/configuration/configuration/
type Config struct {
Global GlobalConfig `yaml:"global,omitempty"`
@@ -1138,12 +1153,28 @@ func (stc *StaticConfig) appendScrapeWork(dst []*ScrapeWork, swc *scrapeWorkConf
}
func appendScrapeWorkKey(dst []byte, labels *promutil.Labels) []byte {
for _, label := range labels.GetLabels() {
// Do not use strconv.AppendQuote, since it is slow according to CPU profile.
dst = append(dst, label.Name...)
dst = append(dst, '=')
dst = append(dst, label.Value...)
dst = append(dst, ',')
originalDstLen := len(dst)
for _, targetLabelName := range clusterShardByLabelsSorted {
for _, label := range labels.GetLabels() {
if label.Name == targetLabelName {
// Do not use strconv.AppendQuote, since it is slow according to CPU profile.
dst = append(dst, label.Name...)
dst = append(dst, '=')
dst = append(dst, label.Value...)
dst = append(dst, ',')
break
}
}
}
// Use all labels to compute the key if `promscrape.cluster.shardByLabels` is not configured
if len(dst) == originalDstLen {
for _, label := range labels.GetLabels() {
dst = append(dst, label.Name...)
dst = append(dst, '=')
dst = append(dst, label.Value...)
dst = append(dst, ',')
}
return dst
}
return dst
}

View File

@@ -148,6 +148,77 @@ func TestGetClusterMemberNumsForScrapeWork(t *testing.T) {
f("foo", 3, 2, []int{2, 0})
}
func TestAppendScrapeWorkKeyShardByLabels(t *testing.T) {
f := func(labels map[string]string, shardByLabels []string, expectedKey string) {
t.Helper()
originValue := *clusterShardByLabels
*clusterShardByLabels = shardByLabels
defer func() {
*clusterShardByLabels = originValue
}()
initClusterShardByLabels()
outputKey := string(appendScrapeWorkKey(nil, promutil.NewLabelsFromMap(labels)))
if expectedKey != outputKey {
t.Fatalf("unexpected sharding key:%q for target labels:%v with shardByLabels=%q, expect: %q",
outputKey, labels, shardByLabels, expectedKey)
}
}
// didn't specify -promscrape.cluster.shardByLabels, so all labels will be used for sharding
f(
map[string]string{
"a": "aa",
"b": "bb",
"c": "cc",
"d": "dd"},
[]string{},
"a=aa,b=bb,c=cc,d=dd,",
)
// match all labels in -promscrape.cluster.shardByLabels, so label "a" and "c" will be used for sharding
f(
map[string]string{
"a": "aa",
"b": "bb",
"c": "cc",
"d": "dd"},
[]string{"a", "c"},
"a=aa,c=cc,",
)
// match all labels in -promscrape.cluster.shardByLabels, so label "a" and "c" will be used for sharding even if they're not in order in -promscrape.cluster.shardByLabels.
f(
map[string]string{
"a": "aa",
"b": "bb",
"c": "cc",
"d": "dd"},
[]string{"c", "a"},
"a=aa,c=cc,",
)
// match part of labels in -promscrape.cluster.shardByLabels, label "a" and "c" will be used for sharding
f(
map[string]string{
"a": "aa",
"c": "cc",
"d": "dd"},
[]string{"a", "b", "c"},
"a=aa,c=cc,",
)
// none of labels in -promscrape.cluster.shardByLabels is matched, so all labels will be used for sharding
f(
map[string]string{
"d": "dd",
"e": "ee"},
[]string{"a", "b", "c"},
"d=dd,e=ee,",
)
}
func TestLoadStaticConfigs(t *testing.T) {
scs, err := loadStaticConfigs("testdata/file_sd.json")
if err != nil {

View File

@@ -66,6 +66,7 @@ func CheckConfig() error {
// Scraped data is passed to pushData.
func Init(pushData func(at *auth.Token, wr *prompb.WriteRequest)) {
mustInitClusterMemberID()
initClusterShardByLabels()
globalStopChan = make(chan struct{})
scraperWG.Go(func() {
runScraper(*promscrapeConfigFile, pushData, globalStopChan)