From de2bc4237a8b174fe01f733428718c9c97e0a6b8 Mon Sep 17 00:00:00 2001 From: JAYICE <1185430411@qq.com> Date: Fri, 3 Apr 2026 16:26:58 +0800 Subject: [PATCH] lib/backup/s3: retry the requests that failed with unexpected EOF When the network between the client and the S3 server is unstable, the client may encounter temporary unexpected EOF errors (io.ErrUnexpectedEOF) while reading the response from the S3 server. Currently, the S3 SDK in vmbackup uses the default retry policy. However, this default retry policy does not retry when the S3 SDK encounters an unexpected EOF. This means that a temporary unexpected EOF error causes the backup task to fail. Fixes https://github.com/VictoriaMetrics/VictoriaMetrics/issues/10699 --- docs/victoriametrics/changelog/CHANGELOG.md | 2 ++ lib/backup/s3remote/s3.go | 8 +++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/docs/victoriametrics/changelog/CHANGELOG.md b/docs/victoriametrics/changelog/CHANGELOG.md index e826a86977..87ae844829 100644 --- a/docs/victoriametrics/changelog/CHANGELOG.md +++ b/docs/victoriametrics/changelog/CHANGELOG.md @@ -29,6 +29,8 @@ See also [LTS releases](https://docs.victoriametrics.com/victoriametrics/lts-rel * FEATURE: introduce `vm_filestream_fsync_duration_seconds_total` and `vm_filestream_fsync_calls_total` metrics, which can be used for detecting slow storage if it cannot keep up with the current data ingestion rate. See [#10432](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/10432). Thanks to @mehrdadbn9 for the contribution. * FEATURE: [vmctl](https://docs.victoriametrics.com/victoriametrics/vmctl/): add dedicated `thanos` mode for [migrating data from Thanos](https://docs.victoriametrics.com/victoriametrics/vmctl/thanos/). This mode supports both raw and downsampled Thanos blocks, including all aggregate types (count, sum, min, max, counter). Each aggregate is imported as a separate metric with resolution and aggregate type suffixes (e.g., `metric_name:5m:count`). 
The new mode uses `--thanos-*` prefixed flags: `--thanos-snapshot`, `--thanos-concurrency`, `--thanos-filter-time-start`, `--thanos-filter-time-end`, `--thanos-filter-label`, `--thanos-filter-label-value`, and `--thanos-aggr-types`. See [#9262](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/9262). +* BUGFIX: [vmbackup](https://docs.victoriametrics.com/vmbackup/), [vmbackupmanager](https://docs.victoriametrics.com/victoriametrics/vmbackupmanager/): retry requests to the S3 service that fail with an unexpected EOF caused by an unstable network connection. See [#10699](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/10699). + ## [v1.139.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.139.0) Released at 2026-03-27 diff --git a/lib/backup/s3remote/s3.go b/lib/backup/s3remote/s3.go index 2d50f731cb..f43ccf3439 100644 --- a/lib/backup/s3remote/s3.go +++ b/lib/backup/s3remote/s3.go @@ -4,6 +4,7 @@ import ( "bytes" "context" "crypto/tls" + "errors" "fmt" "io" "net/http" @@ -164,7 +165,12 @@ func (fs *FS) Init(ctx context.Context) error { // See: https://github.com/VictoriaMetrics/VictoriaMetrics/issues/9280 "ExpiredToken": {}, }, - }) + }, retry.IsErrorRetryableFunc(func(err error) aws.Ternary { + if errors.Is(err, io.ErrUnexpectedEOF) { + return aws.TrueTernary + } + return aws.UnknownTernary + })) }) }), }