Compare commits

...

9 Commits

Author SHA1 Message Date
Jiekun
47176dbc37 vmselect: use NewRequest to create cache reset propagate request 2026-06-17 15:33:39 +08:00
Jiekun
43bac32761 vmselect: properly escape auth key for resetRollupResultCache when propagating 2026-06-17 10:42:49 +08:00
Jiekun
1ea4884d8a vmselect: update doc 2026-06-17 10:06:19 +08:00
Jiekun
83d387c6bb vmselect: add cache key to query argument when propagate rollup cache reset operation 2026-06-17 10:03:22 +08:00
Jiekun
9ef8cc0036 Merge branch 'cluster' into fix/reset-cache-propagation 2026-06-16 23:16:52 +08:00
Zhu Jiekun
c27ca50fd1 Apply suggestions from code review
Co-authored-by: cubic-dev-ai[bot] <191113872+cubic-dev-ai[bot]@users.noreply.github.com>
Signed-off-by: Zhu Jiekun <jiekun@victoriametrics.com>
2026-06-16 23:11:16 +08:00
Jiekun
34e858888f vmselect: re-mux the resetRollupResultCaches function 2026-06-16 23:10:22 +08:00
Jiekun
7e72ecb00a vmselect: propagate cache reset operation only when propagate argument is set 2026-06-16 23:07:46 +08:00
Jiekun
3ad3ae7f16 vmselect: propagate cache reset operation to selectNode when /internal/resetRollupResultCache is called 2026-06-16 22:27:46 +08:00
4 changed files with 52 additions and 14 deletions

View File

@@ -56,7 +56,6 @@ var (
minScrapeInterval = flag.Duration("dedup.minScrapeInterval", 0, "Leave only the last sample in every time series per each discrete interval "+
"equal to -dedup.minScrapeInterval > 0. See https://docs.victoriametrics.com/victoriametrics/single-server-victoriametrics/#deduplication for details")
deleteAuthKey = flagutil.NewPassword("deleteAuthKey", "authKey for metrics' deletion via /prometheus/api/v1/admin/tsdb/delete_series and /graphite/tags/delSeries. It could be passed via authKey query arg. It overrides -httpAuth.*")
resetCacheAuthKey = flagutil.NewPassword("search.resetCacheAuthKey", "Optional authKey for resetting rollup cache via /internal/resetRollupResultCache call. It could be passed via authKey query arg. It overrides -httpAuth.*")
metricNamesStatsResetAuthKey = flagutil.NewPassword("metricNamesStatsResetAuthKey", "authKey for resetting metric names usage cache via /api/v1/admin/status/metric_names_stats/reset. It overrides -httpAuth.*. "+
"See https://docs.victoriametrics.com/victoriametrics/single-server-victoriametrics/#track-ingested-metrics-usage")
@@ -260,11 +259,10 @@ func requestHandler(w http.ResponseWriter, r *http.Request) bool {
}
if path == "/internal/resetRollupResultCache" {
if !httpserver.CheckAuthFlag(w, r, resetCacheAuthKey) {
if !httpserver.CheckAuthFlag(w, r, prometheus.GetResetCacheAuthKey()) {
return true
}
promql.ResetRollupResultCache()
return true
return prometheus.ResetRollupResultCacheHandler(w, r)
}
if path == "/admin/tenants" {
tenantsRequests.Inc()

View File

@@ -51,9 +51,10 @@ var (
"If set to true, the query model becomes closer to InfluxDB data model. If set to true, then -search.maxLookback and -search.maxStalenessInterval are ignored")
maxStepForPointsAdjustment = flag.Duration("search.maxStepForPointsAdjustment", time.Minute, "The maximum step when /api/v1/query_range handler adjusts "+
"points with timestamps closer than -search.latencyOffset to the current time. The adjustment is needed because such points may contain incomplete data")
selectNodes = flagutil.NewArrayString("selectNode", "A list of vmselect node addresses to propagate the '/internal/resetRollupResultCache' call. "+
"If this flag isn't set, then cache need to be purged from each vmselect individually. "+
selectNodes = flagutil.NewArrayString("selectNode", "A list of vmselect node addresses to propagate the '/internal/resetRollupResultCache' call with 'propagate=1' argument. "+
"If this flag or the 'propagate' argument isn't set, then cache need to be purged from each vmselect individually. "+
"Comma-separated addresses of vmselect nodes; usage: -selectNode=vmselect-host1,...,vmselect-hostN")
resetCacheAuthKey = flagutil.NewPassword("search.resetCacheAuthKey", "Optional authKey for resetting rollup cache via /internal/resetRollupResultCache call. It could be passed via authKey query arg. It overrides -httpAuth.*. It'll be used when reset request is propagate to other -selectNode.")
maxUniqueTimeseries = flag.Int("search.maxUniqueTimeseries", 0, "The maximum number of unique time series, which can be selected during /api/v1/query and /api/v1/query_range queries. This option allows limiting memory usage. "+
"The limit can't exceed the explicitly set corresponding value `-search.maxUniqueTimeseries` on vmstorage side.")
@@ -562,7 +563,7 @@ func DeleteHandler(startTime time.Time, at *auth.Token, r *http.Request) error {
// Reset rollup result cache on all the vmselect nodes,
// since the cache may contain deleted data.
// TODO: reset only cache for (account, project)
resetRollupResultCaches()
resetRollupResultCachesAndPropagate()
}
logger.Infof("/api/v1/admin/tsdb/delete_series has been called for %q. Deleted %d series.", sq.FiltersString(), deletedCount)
return nil
@@ -570,33 +571,69 @@ func DeleteHandler(startTime time.Time, at *auth.Token, r *http.Request) error {
var deleteDuration = metrics.NewSummary(`vm_request_duration_seconds{path="/api/v1/admin/tsdb/delete_series"}`)
// ResetRollupResultCacheHandler serves the `/internal/resetRollupResultCache` API.
//
// The rollup result cache of the local vmselect is reset on every call.
// If the `propagate` request arg is set, the reset is additionally forwarded
// to the vmselect nodes listed in -selectNode.
//
// The handler always returns true; requestHandler returns this value as is.
func ResetRollupResultCacheHandler(w http.ResponseWriter, r *http.Request) bool {
	if httputil.GetBool(r, "propagate") {
		// The caller explicitly asked to fan the reset out to the other vmselect nodes.
		// The forwarded requests are built without the `propagate` arg,
		// so the receiving nodes end up in the local-only branch below.
		resetRollupResultCachesAndPropagate()
	} else {
		// No `propagate` arg: either a plain manual call or a request forwarded
		// by another vmselect. Reset only the local cache.
		resetRollupResultCaches()
	}
	return true
}
// GetResetCacheAuthKey returns the *flagutil.Password registered for the -search.resetCacheAuthKey flag.
//
// It is exported so that code outside this package can pass the flag to httpserver.CheckAuthFlag
// when serving `/internal/resetRollupResultCache`. The same value is attached as `authKey` query arg
// to the cache reset requests forwarded to -selectNode nodes.
func GetResetCacheAuthKey() *flagutil.Password {
return resetCacheAuthKey
}
// resetRollupResultCaches resets the rollup result cache of the local vmselect only.
//
// It does not contact the nodes listed in -selectNode.
func resetRollupResultCaches() {
// Record the reset call in resetRollupResultCacheCalls.
resetRollupResultCacheCalls.Inc()
// Reset the local cache unconditionally, so it is purged
// regardless of whether the reset is later forwarded to other nodes.
promql.ResetRollupResultCache()
}
func resetRollupResultCachesAndPropagate() {
resetRollupResultCaches()
if len(*selectNodes) == 0 {
logger.Warnf("missing -selectNode flag, cache reset request wont be propagated to the other vmselect nodes." +
"This can be fixed by enumerating all the vmselect node addresses in `-selectNode` command line flag. " +
" For example: -selectNode=select-addr-1:8481,select-addr-2:8481")
return
}
rcAuthKey := GetResetCacheAuthKey().Get()
for _, selectNode := range *selectNodes {
normalizedAddr, err := netutil.NormalizeAddr(selectNode, 8481)
if err != nil {
logger.Fatalf("cannot normalize -selectNode=%q: %s", selectNode, err)
}
selectNode = normalizedAddr
callURL := fmt.Sprintf("http://%s/internal/resetRollupResultCache", selectNode)
resp, err := httpClient.Get(callURL)
req, err := http.NewRequest(http.MethodGet, fmt.Sprintf("http://%s/internal/resetRollupResultCache", selectNode), nil)
if err != nil {
logger.Errorf("error when accessing %q: %s", callURL, err)
logger.Errorf("cannot create cache reset request for %q: %s", selectNode, err)
resetRollupResultCacheErrors.Inc()
continue
}
// usually `-search.resetCacheAuthKey` is set to the same on each vmselect. it's good to propagate with this argument.
if rcAuthKey != "" {
q := req.URL.Query()
q.Add("authKey", rcAuthKey)
req.URL.RawQuery = q.Encode()
}
resp, err := httpClient.Do(req)
if err != nil {
logger.Errorf("error when accessing %q: %s", req.URL.String(), err)
resetRollupResultCacheErrors.Inc()
continue
}
if resp.StatusCode != http.StatusOK {
_ = resp.Body.Close()
logger.Errorf("unexpected status code at %q; got %d; want %d", callURL, resp.StatusCode, http.StatusOK)
logger.Errorf("unexpected status code at %q; got %d; want %d", req.URL.String(), resp.StatusCode, http.StatusOK)
resetRollupResultCacheErrors.Inc()
continue
}

View File

@@ -35,6 +35,7 @@ See also [LTS releases](https://docs.victoriametrics.com/victoriametrics/lts-rel
* BUGFIX: [vmalert](https://docs.victoriametrics.com/victoriametrics/vmalert/),[vmauth](https://docs.victoriametrics.com/victoriametrics/vmauth/),[vmagent](https://docs.victoriametrics.com/victoriametrics/vmagent/) and [vmsingle](https://docs.victoriametrics.com/victoriametrics/single-server-victoriametrics/): fix rare unbounded shutdown delay when config reload takes longer than `-configCheckInterval`. See [#11107](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/11107). Thanks to @PleasingFungus for contribution.
* BUGFIX: `vmselect` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/victoriametrics/cluster-victoriametrics/): fix corrupted metrics metadata when a response contains multiple rows. See [#11115](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/11115). Thanks to @fxrlv for the contribution.
* BUGFIX: [vmbackup](https://docs.victoriametrics.com/vmbackup/), [vmbackupmanager](https://docs.victoriametrics.com/victoriametrics/vmbackupmanager/): do not fail backup list if directory is absent while using `fs://` destination to align with other protocols. See [6c3c548](https://github.com/VictoriaMetrics/VictoriaMetrics/commit/6c3c548ddb0385b749e731f52276f130e2a4e4a8)
* BUGFIX: `vmselect` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/victoriametrics/cluster-victoriametrics/): propagate the cache reset operation to the vmselect nodes listed in `-selectNode` when `/internal/resetRollupResultCache` is called with the `propagate=1` argument. Previously, the propagation only happened when the `delete_series` API was called. See [#11112](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/11112).
## [v1.145.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.145.0)

View File

@@ -622,11 +622,13 @@ curl -Is http://localhost:8428/internal/resetRollupResultCache
Cluster version of VictoriaMetrics:
```sh
curl -Is http://<vmselect>:8481/internal/resetRollupResultCache
curl -Is http://<vmselect>:8481/internal/resetRollupResultCache?propagate=1
```
vmselect will propagate this call to the rest of the vmselects listed in its `-selectNode` cmd-line flag. If this
flag isn't set, then cache need to be purged from each vmselect individually.
vmselect propagates this call to the rest of the vmselects listed in its `-selectNode` cmd-line flag when the `propagate=1` argument is set.
If this flag or the `propagate` argument isn't set, then the cache needs to be purged from each vmselect individually.
If `-search.resetCacheAuthKey` is set, its value is attached to the propagated requests as the `authKey` query argument.
### TCP and UDP