From 5a50df3f611d8429f94de47aa6d787112c446d4f Mon Sep 17 00:00:00 2001 From: f41gh7 Date: Fri, 16 Jan 2026 16:18:31 +0100 Subject: [PATCH] lib/storage: increase rotation time for daily metricID cache This is a follow-up for c5713a09d3882d26175c46dff66489a881bda6c5 Originally, the dateMetricID cache was fully rotated every 20 minutes. This made daily-index pre-creation less efficient and caused CPU usage spikes for index record lookups at midnight. Storage pre-fills index records for the next day during the last hour before midnight. But this rotation left only the entries from the last 20 minutes before midnight visible in the cache. This commit changes the rotation period from 20 minutes to 2 hours (1 hour tick interval). While it could slightly increase cache memory usage (in practice it shouldn't be noticeable), it prevents CPU usage spikes. Fixes https://github.com/VictoriaMetrics/VictoriaMetrics/issues/10064 --- docs/victoriametrics/changelog/CHANGELOG.md | 1 + lib/storage/date_metric_id_cache.go | 3 +- lib/storage/storage_synctest_test.go | 49 ++++++++++++--------- 3 files changed, 31 insertions(+), 22 deletions(-) diff --git a/docs/victoriametrics/changelog/CHANGELOG.md b/docs/victoriametrics/changelog/CHANGELOG.md index c37d30aeb5..853fa590de 100644 --- a/docs/victoriametrics/changelog/CHANGELOG.md +++ b/docs/victoriametrics/changelog/CHANGELOG.md @@ -37,6 +37,7 @@ See also [LTS releases](https://docs.victoriametrics.com/victoriametrics/lts-rel * FEATURE: [vmsingle](https://docs.victoriametrics.com/victoriametrics/single-server-victoriametrics/) and `vmselect` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/victoriametrics/cluster-victoriametrics/): calculate the lookbehind window as the median of the intervals between the last 20 raw samples within the requested time range for range queries. Previously, this calculation was based on the first 20 samples, using the last 20 samples should improve accuracy for recent data. 
See [#10281](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/10281). * BUGFIX: [vmagent](https://docs.victoriametrics.com/victoriametrics/vmagent/): fix configuration reloading for `-remoteWrite.relabelConfig` and `-remoteWrite.urlRelabelConfig` when vmagent is launched with empty files. Previously, if vmagent started with an empty config, subsequent config reloads were ignored. See [#10211](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/10211). +* BUGFIX: [vmsingle](https://docs.victoriametrics.com/victoriametrics/single-server-victoriametrics/) and `vmstorage` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/victoriametrics/cluster-victoriametrics/): prevent slow ingestion requests and CPU usage spikes during midnight daily-index creation. See [#10064](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/10064). * BUGFIX: [vmsingle](https://docs.victoriametrics.com/victoriametrics/single-server-victoriametrics/): fix a missing path error for `http://:8428/zabbixconnector/api/v1/history`. See [10214](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/10214). * BUGFIX: `vmstorage` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/victoriametrics/cluster-victoriametrics/): reduce default value for `storage.vminsertConnsShutdownDuration` flag from `25s` to `10s` seconds. It reduces probability of ungraceful storage shutdown at Kubernetes based environments, which has 30 seconds default graceful termination period value. See [#10273](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/10273) * BUGFIX: [vmui](https://docs.victoriametrics.com/victoriametrics/single-server-victoriametrics/#vmui): remove legacy `tenantID` query param and use the URL path as the single source of truth for multitenancy. See [#10232](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/10232). 
diff --git a/lib/storage/date_metric_id_cache.go b/lib/storage/date_metric_id_cache.go index 3a9980eef9..ec73171b47 100644 --- a/lib/storage/date_metric_id_cache.go +++ b/lib/storage/date_metric_id_cache.go @@ -220,7 +220,8 @@ func (dmc *dateMetricIDCache) syncLocked() { } func (dmc *dateMetricIDCache) startRotation() { - d := timeutil.AddJitterToDuration(10 * time.Minute) + // 1 hour was chosen based on https://github.com/VictoriaMetrics/VictoriaMetrics/issues/10064#issuecomment-3749046726 + d := timeutil.AddJitterToDuration(time.Hour) ticker := time.NewTicker(d) defer ticker.Stop() for { diff --git a/lib/storage/storage_synctest_test.go b/lib/storage/storage_synctest_test.go index 0f014d86ff..9b2dbec843 100644 --- a/lib/storage/storage_synctest_test.go +++ b/lib/storage/storage_synctest_test.go @@ -463,6 +463,7 @@ func TestStorageAddRows_nextDayIndexPrefill(t *testing.T) { MaxTimestamp: time.Now().Add(+15 * time.Minute).UnixMilli(), }) s := MustOpenStorage(t.Name(), OpenOptions{}) + defer s.MustClose() s.AddRows(mrs0, defaultPrecisionBits) s.DebugFlush() if got, want := countMetricIDs(t, s, "metric0", today), numSeries; got != want { @@ -484,12 +485,6 @@ func TestStorageAddRows_nextDayIndexPrefill(t *testing.T) { t.Fatalf("unexpected metric id count for next day: got %d, want %d", got, want) } - // Close the storage and reopen it 15m later instead of keeping it open - // and waiting. This is to make the test faster. Storage has a lot of - // background tasks that are activated every 1-10 seconds and synctest's - // time.Sleep() will wake them up many times. Closing storage before - // sleeping seems to eliminate this. - // // At 23:15 the prefill must work. // // However, the mrs1 timestamps are not within the current hour and @@ -498,9 +493,7 @@ func TestStorageAddRows_nextDayIndexPrefill(t *testing.T) { // // The mrs2 timestamps are within the current hour so some next day index // entries will be created. 
- s.MustClose() time.Sleep(15 * time.Minute) // 2000-01-01T23:15:00Z - s = MustOpenStorage(t.Name(), OpenOptions{}) mrs1 := testGenerateMetricRowsWithPrefixForTenantID(rng, accountID, projectID, numSeries, "metric1", TimeRange{ MinTimestamp: time.Now().Add(-30 * time.Minute).UnixMilli(), MaxTimestamp: time.Now().Add(-15 * time.Minute).UnixMilli(), @@ -526,13 +519,7 @@ func TestStorageAddRows_nextDayIndexPrefill(t *testing.T) { t.Fatalf("unexpected metric id count for next day: got 0, want > 0") } - // Close the storage and reopen it at 23:30. - // - // Since we are now closer to midnight than we were at 23:15, more next - // day entries must be created. - s.MustClose() time.Sleep(15 * time.Minute) // 2000-01-01T23:30:00Z - s = MustOpenStorage(t.Name(), OpenOptions{}) mrs3 := testGenerateMetricRowsWithPrefixForTenantID(rng, accountID, projectID, numSeries, "metric3", TimeRange{ MinTimestamp: time.Now().Add(-15 * time.Minute).UnixMilli(), MaxTimestamp: time.Now().UnixMilli(), @@ -547,13 +534,7 @@ func TestStorageAddRows_nextDayIndexPrefill(t *testing.T) { t.Fatalf("unexpected metric id count for next day: got %d, want > %d", got30min, got15min) } - // Close the storage and reopen it at 23:45. - // - // Since we are now closer to midnight than we were at 23:30, more next - // day entries must be created. 
- s.MustClose() time.Sleep(15 * time.Minute) // 2000-01-01T23:45:00Z - s = MustOpenStorage(t.Name(), OpenOptions{}) mrs4 := testGenerateMetricRowsWithPrefixForTenantID(rng, accountID, projectID, numSeries, "metric4", TimeRange{ MinTimestamp: time.Now().Add(-15 * time.Minute).UnixMilli(), MaxTimestamp: time.Now().UnixMilli(), @@ -565,7 +546,33 @@ func TestStorageAddRows_nextDayIndexPrefill(t *testing.T) { t.Fatalf("unexpected metric id count for next day: got %d, want > %d", got45min, got30min) } - s.MustClose() + // Sleep until the next day. + // Do not close the storage: closing it resets the dateMetricID cache and results in slow inserts, + // since the dateMetricID cache is not persisted on disk. + + time.Sleep(35 * time.Minute) // 2000-01-02T00:20:00Z + synctest.Wait() + + // Ingest data for the next day. It must hit the dateMetricID cache and + // must not result in a significant amount of slow inserts. + var m Metrics + s.UpdateMetrics(&m) + currDaySlowInserts := m.SlowPerDayIndexInserts + mrs3NextDay := testGenerateMetricRowsWithPrefixForTenantID(rng, accountID, projectID, numSeries, "metric3", TimeRange{ + MinTimestamp: time.Now().Add(-5 * time.Minute).UnixMilli(), + MaxTimestamp: time.Now().UnixMilli(), + }) + + s.AddRows(mrs3NextDay, defaultPrecisionBits) + s.DebugFlush() + m.Reset() + s.UpdateMetrics(&m) + nextDaySlowInserts := m.SlowPerDayIndexInserts + slowInserts := nextDaySlowInserts - currDaySlowInserts + if slowInserts >= numSeries { + t.Errorf("unexpected amount of slow inserts: got %d, want < %d", slowInserts, numSeries) + } + }) }