app/vmctl: return errors instead of silently skipping unexpected OpenTSDB responses

Previously, `GetData` in the OpenTSDB client returned an empty `Metric{}` with a
`nil` error for several conditions (multiple series returned, aggregate
tags present, `modifyData` failures), causing `vmctl opentsdb` to
silently drop series during migration.

This commit changes these silent return paths to return proper errors with
descriptive messages including the query string, so operators can detect
and diagnose partial migrations.

Related PR https://github.com/VictoriaMetrics/VictoriaMetrics/pull/10797
This commit is contained in:
cubic-dev-ai[bot]
2026-04-22 11:28:55 +02:00
committed by GitHub
parent a3df0f890b
commit 2c262c5ef6
6 changed files with 420 additions and 50 deletions

View File

@@ -8,10 +8,10 @@ import (
"time"
vmetrics "github.com/VictoriaMetrics/metrics"
"github.com/cheggaaa/pb/v3"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmctl/opentsdb"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmctl/vm"
"github.com/cheggaaa/pb/v3"
)
type otsdbProcessor struct {
@@ -89,9 +89,6 @@ func (op *otsdbProcessor) run(ctx context.Context) error {
// we're going to make serieslist * queryRanges queries, so we should represent that in the progress bar
otsdbSeriesTotal.Add(len(serieslist) * queryRanges)
bar := pb.StartNew(len(serieslist) * queryRanges)
defer func(bar *pb.ProgressBar) {
bar.Finish()
}(bar)
var wg sync.WaitGroup
for range op.otsdbcc {
wg.Go(func() {
@@ -106,41 +103,22 @@ func (op *otsdbProcessor) run(ctx context.Context) error {
}
})
}
/*
Loop through all series for this metric, processing all retentions and time ranges
requested. This loop is our primary "collect data from OpenTSDB loop" and should
be async, sending data to VictoriaMetrics over time.
runErr := op.sendQueries(ctx, serieslist, seriesCh, errCh, startTime)
The idea with having the select at the inner-most loop is to ensure quick
short-circuiting on error.
*/
for _, series := range serieslist {
for _, rt := range op.oc.Retentions {
for _, tr := range rt.QueryRanges {
select {
case otsdbErr := <-errCh:
return fmt.Errorf("opentsdb error: %s", otsdbErr)
case vmErr := <-op.im.Errors():
otsdbErrorsTotal.Inc()
return fmt.Errorf("import process failed: %s", wrapErr(vmErr, op.isVerbose))
case seriesCh <- queryObj{
Tr: tr, StartTime: startTime,
Series: series, Rt: opentsdb.RetentionMeta{
FirstOrder: rt.FirstOrder, SecondOrder: rt.SecondOrder, AggTime: rt.AggTime}}:
}
}
}
}
// Drain channels per metric
// Always drain channels and wait for workers to prevent goroutine leaks
close(seriesCh)
wg.Wait()
close(errCh)
// check for any lingering errors on the query side
for otsdbErr := range errCh {
return fmt.Errorf("import process failed: \n%s", otsdbErr)
if runErr == nil {
runErr = fmt.Errorf("import process failed: \n%s", otsdbErr)
}
}
bar.Finish()
if runErr != nil {
return runErr
}
log.Print(op.im.Stats())
}
op.im.Close()
@@ -155,6 +133,34 @@ func (op *otsdbProcessor) run(ctx context.Context) error {
return nil
}
// sendQueries iterates over all series and retention ranges, sending queries to workers.
// It returns early if ctx is canceled or an error is received.
//
// For every (series, retention, query-range) triple it attempts to hand one
// queryObj to a worker over seriesCh. The select at the innermost loop ensures
// quick short-circuiting: as soon as the collection side (errCh) or the import
// side (op.im.Errors()) reports a failure, no further work is enqueued.
// Returns nil once every query has been handed off.
func (op *otsdbProcessor) sendQueries(ctx context.Context, serieslist []opentsdb.Meta, seriesCh chan<- queryObj, errCh <-chan error, startTime int64) error {
	for _, series := range serieslist {
		for _, rt := range op.oc.Retentions {
			for _, tr := range rt.QueryRanges {
				select {
				case <-ctx.Done():
					// Caller canceled the migration; stop enqueueing work.
					return fmt.Errorf("context canceled: %s", ctx.Err())
				case otsdbErr := <-errCh:
					// A worker failed while collecting data from OpenTSDB.
					otsdbErrorsTotal.Inc()
					return fmt.Errorf("opentsdb error: %s", otsdbErr)
				case vmErr := <-op.im.Errors():
					// The VictoriaMetrics importer reported a failure.
					return fmt.Errorf("import process failed: %s", wrapErr(vmErr, op.isVerbose))
				case seriesCh <- queryObj{
					Tr: tr, StartTime: startTime,
					Series: series, Rt: opentsdb.RetentionMeta{
						FirstOrder:  rt.FirstOrder,
						SecondOrder: rt.SecondOrder,
						AggTime:     rt.AggTime,
					}}:
					// Query successfully handed off to a worker.
				}
			}
		}
	}
	return nil
}
func (op *otsdbProcessor) do(s queryObj) error {
start := s.StartTime - s.Tr.Start
end := s.StartTime - s.Tr.End
@@ -163,6 +169,7 @@ func (op *otsdbProcessor) do(s queryObj) error {
return fmt.Errorf("failed to collect data for %v in %v:%v :: %v", s.Series, s.Rt, s.Tr, err)
}
if len(data.Timestamps) < 1 || len(data.Values) < 1 {
log.Printf("no data found for %v in %v:%v...skipping", s.Series, s.Rt, s.Tr)
return nil
}
labels := make([]vm.LabelPair, 0, len(data.Tags))

View File

@@ -108,10 +108,10 @@ func (c Client) FindMetrics(q string) ([]string, error) {
if err != nil {
return nil, fmt.Errorf("failed to send GET request to %q: %s", q, err)
}
defer func() { _ = resp.Body.Close() }()
if resp.StatusCode != 200 {
return nil, fmt.Errorf("bad return from OpenTSDB: %d: %v", resp.StatusCode, resp)
}
defer func() { _ = resp.Body.Close() }()
body, err := io.ReadAll(resp.Body)
if err != nil {
return nil, fmt.Errorf("could not retrieve metric data from %q: %s", q, err)
@@ -130,12 +130,12 @@ func (c Client) FindSeries(metric string) ([]Meta, error) {
q := fmt.Sprintf("%s/api/search/lookup?m=%s&limit=%d", c.Addr, metric, c.Limit)
resp, err := c.c.Get(q)
if err != nil {
return nil, fmt.Errorf("failed to set GET request to %q: %s", q, err)
return nil, fmt.Errorf("failed to send GET request to %q: %s", q, err)
}
defer func() { _ = resp.Body.Close() }()
if resp.StatusCode != 200 {
return nil, fmt.Errorf("bad return from OpenTSDB: %d: %v", resp.StatusCode, resp)
}
defer func() { _ = resp.Body.Close() }()
body, err := io.ReadAll(resp.Body)
if err != nil {
return nil, fmt.Errorf("could not retrieve series data from %q: %s", q, err)
@@ -185,6 +185,7 @@ func (c Client) GetData(series Meta, rt RetentionMeta, start int64, end int64, m
if err != nil {
return Metric{}, fmt.Errorf("failed to send GET request to %q: %s", q, err)
}
defer func() { _ = resp.Body.Close() }()
/*
There are three potential failures here, none of which should kill the entire
migration run:
@@ -196,7 +197,6 @@ func (c Client) GetData(series Meta, rt RetentionMeta, start int64, end int64, m
log.Printf("bad response code from OpenTSDB query %v for %q...skipping", resp.StatusCode, q)
return Metric{}, nil
}
defer func() { _ = resp.Body.Close() }()
body, err := io.ReadAll(resp.Body)
if err != nil {
log.Println("couldn't read response body from OpenTSDB query...skipping")
@@ -239,27 +239,20 @@ func (c Client) GetData(series Meta, rt RetentionMeta, start int64, end int64, m
In all "bad" cases, we don't end the migration, we just don't process that particular message
*/
if len(output) < 1 {
// no results returned...return an empty object without error
return Metric{}, nil
}
if len(output) > 1 {
// multiple series returned for a single query. We can't process this right, so...
return Metric{}, nil
return Metric{}, fmt.Errorf("unexpected number of series returned: %d for query %q; expected 1", len(output), q)
}
if len(output[0].AggregateTags) > 0 {
// This failure means we've suppressed potential series somehow...
return Metric{}, nil
return Metric{}, fmt.Errorf("aggregate tags %v present in response for query %q; series may be suppressed", output[0].AggregateTags, q)
}
data := Metric{}
data.Metric = output[0].Metric
data.Tags = output[0].Tags
/*
We evaluate data for correctness before formatting the actual values
to skip a little bit of time if the series has invalid formatting
*/
data, err = modifyData(data, c.Normalize)
if err != nil {
return Metric{}, nil
return Metric{}, fmt.Errorf("failed to convert metric data for query %q: %w", q, err)
}
/*

View File

@@ -32,7 +32,7 @@ func convertDuration(duration string) (time.Duration, error) {
var err error
var timeValue int
if strings.HasSuffix(duration, "y") {
timeValue, err = strconv.Atoi(strings.Trim(duration, "y"))
timeValue, err = strconv.Atoi(strings.TrimSuffix(duration, "y"))
if err != nil {
return 0, fmt.Errorf("invalid time range: %q", duration)
}
@@ -42,7 +42,7 @@ func convertDuration(duration string) (time.Duration, error) {
return 0, fmt.Errorf("invalid time range: %q", duration)
}
} else if strings.HasSuffix(duration, "w") {
timeValue, err = strconv.Atoi(strings.Trim(duration, "w"))
timeValue, err = strconv.Atoi(strings.TrimSuffix(duration, "w"))
if err != nil {
return 0, fmt.Errorf("invalid time range: %q", duration)
}
@@ -52,7 +52,7 @@ func convertDuration(duration string) (time.Duration, error) {
return 0, fmt.Errorf("invalid time range: %q", duration)
}
} else if strings.HasSuffix(duration, "d") {
timeValue, err = strconv.Atoi(strings.Trim(duration, "d"))
timeValue, err = strconv.Atoi(strings.TrimSuffix(duration, "d"))
if err != nil {
return 0, fmt.Errorf("invalid time range: %q", duration)
}
@@ -95,6 +95,9 @@ func convertRetention(retention string, offset int64, msecTime bool) (Retention,
if !msecTime {
queryLength = queryLength / 1000
}
if queryLength <= 0 {
return Retention{}, fmt.Errorf("ttl %q resolves to non-positive query range %d; use a larger duration", chunks[2], queryLength)
}
queryRange := queryLength
// bump by the offset so we don't look at empty ranges any time offset > ttl
queryLength += offset
@@ -138,16 +141,29 @@ func convertRetention(retention string, offset int64, msecTime bool) (Retention,
2. we discover the actual size of each "chunk"
This is second division step
*/
querySize = int64(queryRange / (queryRange / (rowLength * 4)))
divisor := queryRange / (rowLength * 4)
if divisor == 0 {
querySize = queryRange
} else {
querySize = queryRange / divisor
}
} else {
/*
Unless the aggTime (how long a range of data we're requesting per individual point)
is greater than the row size. Then we'll need to use that to determine
how big each individual query should be
*/
querySize = int64(queryRange / (queryRange / (aggTime * 4)))
divisor := queryRange / (aggTime * 4)
if divisor == 0 {
querySize = queryRange
} else {
querySize = queryRange / divisor
}
}
if querySize <= 0 {
return Retention{}, fmt.Errorf("computed non-positive querySize=%d for retention %q; check parameters", querySize, retention)
}
var timeChunks []TimeRange
var i int64
for i = offset; i <= queryLength; i = i + querySize {

View File

@@ -0,0 +1,186 @@
package tests
import (
"encoding/json"
"fmt"
"net/http"
"net/http/httptest"
"sort"
"strconv"
"strings"
"testing"
)
// openTSDBPoint is a single data point served by the mock OpenTSDB server.
type openTSDBPoint struct {
	Metric    string            // OpenTSDB metric name, e.g. "test.cpu"
	Tags      map[string]string // tag key/value pairs identifying the series
	Timestamp int64             // unix timestamp; the tests use seconds
	Value     float64           // sample value
}

// openTSDBMockServer implements the minimal subset of the OpenTSDB HTTP API
// used by vmctl opentsdb: /api/suggest, /api/search/lookup, /api/query.
type openTSDBMockServer struct {
	server *httptest.Server // backing httptest server; see httpAddr/close
	points []openTSDBPoint  // fixed dataset served by all endpoints
}
// newOpenTSDBMockServer starts an httptest server serving the given points.
func newOpenTSDBMockServer(t *testing.T, points []openTSDBPoint) *openTSDBMockServer {
	t.Helper()
	srv := &openTSDBMockServer{points: points}
	routes := http.NewServeMux()
	routes.HandleFunc("/api/suggest", srv.handleSuggest)
	routes.HandleFunc("/api/search/lookup", srv.handleLookup)
	routes.HandleFunc("/api/query", srv.handleQuery)
	srv.server = httptest.NewServer(routes)
	return srv
}
// close shuts down the underlying httptest server.
func (s *openTSDBMockServer) close() { s.server.Close() }

// httpAddr returns the base URL of the mock server (http://127.0.0.1:port).
func (s *openTSDBMockServer) httpAddr() string { return s.server.URL }
// handleSuggest serves https://opentsdb.net/docs/build/html/api_http/suggest.html
// It returns the deduplicated metric names that contain the optional q= substring.
func (s *openTSDBMockServer) handleSuggest(w http.ResponseWriter, r *http.Request) {
	substr := r.URL.Query().Get("q")
	emitted := make(map[string]bool, len(s.points))
	var metrics []string
	for _, pt := range s.points {
		if emitted[pt.Metric] {
			continue
		}
		if substr != "" && !strings.Contains(pt.Metric, substr) {
			continue
		}
		emitted[pt.Metric] = true
		metrics = append(metrics, pt.Metric)
	}
	_ = json.NewEncoder(w).Encode(metrics)
}
// handleLookup serves https://opentsdb.net/docs/build/html/api_http/search/lookup.html
// It lists the unique tag sets seen for the requested m= metric.
func (s *openTSDBMockServer) handleLookup(w http.ResponseWriter, r *http.Request) {
	wantMetric := r.URL.Query().Get("m")
	type meta struct {
		Metric string            `json:"metric"`
		Tags   map[string]string `json:"tags"`
	}
	emitted := make(map[string]bool, len(s.points))
	var results []meta
	for _, pt := range s.points {
		if pt.Metric != wantMetric {
			continue
		}
		k := tagsKey(pt.Tags)
		if emitted[k] {
			continue
		}
		emitted[k] = true
		results = append(results, meta{Metric: pt.Metric, Tags: pt.Tags})
	}
	_ = json.NewEncoder(w).Encode(map[string]any{
		"type":    "LOOKUP",
		"metric":  wantMetric,
		"results": results,
	})
}
// handleQuery serves https://opentsdb.net/docs/build/html/api_http/query/index.html
// It filters the stored points by metric, tag filter, and [start, end] range,
// then groups matching samples into one response object per unique tag set.
func (s *openTSDBMockServer) handleQuery(w http.ResponseWriter, r *http.Request) {
	params := r.URL.Query()
	metric, tagFilter, ok := parseQuery(params.Get("m"))
	if !ok {
		http.Error(w, "bad query param", http.StatusBadRequest)
		return
	}
	start, err := strconv.ParseInt(params.Get("start"), 10, 64)
	if err != nil {
		http.Error(w, "bad start param", http.StatusBadRequest)
		return
	}
	end, err := strconv.ParseInt(params.Get("end"), 10, 64)
	if err != nil {
		http.Error(w, "bad end param", http.StatusBadRequest)
		return
	}
	type resp struct {
		Metric        string             `json:"metric"`
		Tags          map[string]string  `json:"tags"`
		AggregateTags []string           `json:"aggregateTags"`
		Dps           map[string]float64 `json:"dps"`
	}
	series := make(map[string]*resp, len(s.points))
	for _, pt := range s.points {
		if pt.Metric != metric || !matchTags(pt.Tags, tagFilter) {
			continue
		}
		if pt.Timestamp < start || pt.Timestamp > end {
			continue
		}
		k := tagsKey(pt.Tags)
		entry := series[k]
		if entry == nil {
			entry = &resp{
				Metric:        pt.Metric,
				Tags:          pt.Tags,
				AggregateTags: []string{},
				Dps:           map[string]float64{},
			}
			series[k] = entry
		}
		entry.Dps[fmt.Sprint(pt.Timestamp)] = pt.Value
	}
	out := make([]*resp, 0, len(series))
	for _, v := range series {
		out = append(out, v)
	}
	_ = json.NewEncoder(w).Encode(out)
}
// parseQuery parses the OpenTSDB m= query parameter.
// Format: "<agg>:<bucket>-<agg>-none:<metric>{k=v,k=v}"
// The tag block is optional; malformed input yields ok == false.
func parseQuery(m string) (string, map[string]string, bool) {
	pieces := strings.SplitN(m, ":", 3)
	if len(pieces) < 3 {
		return "", nil, false
	}
	metric, rawTags, _ := strings.Cut(pieces[2], "{")
	rawTags = strings.TrimSuffix(rawTags, "}")
	tags := make(map[string]string, 4)
	for _, pair := range strings.Split(rawTags, ",") {
		if key, val, found := strings.Cut(pair, "="); found {
			tags[key] = val
		}
	}
	return metric, tags, true
}
// matchTags reports whether got satisfies every constraint in filter.
// A filter value of "*" matches any tag value; an empty filter matches everything.
func matchTags(got, filter map[string]string) bool {
	for key, want := range filter {
		if want != "*" && got[key] != want {
			return false
		}
	}
	return true
}
// tagsKey renders a tag map as a canonical "k=v,k=v" string with keys in
// sorted order, suitable for use as a deduplication/grouping key.
func tagsKey(tags map[string]string) string {
	names := make([]string, 0, len(tags))
	for name := range tags {
		names = append(names, name)
	}
	sort.Strings(names)
	var b strings.Builder
	for i, name := range names {
		if i > 0 {
			b.WriteByte(',')
		}
		b.WriteString(name)
		b.WriteByte('=')
		b.WriteString(tags[name])
	}
	return b.String()
}

View File

@@ -0,0 +1,167 @@
package tests
import (
"fmt"
"testing"
"time"
"github.com/google/go-cmp/cmp"
"github.com/google/go-cmp/cmp/cmpopts"
"github.com/VictoriaMetrics/VictoriaMetrics/apptest"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
)
// TestSingleVmctlOpenTSDBProtocol verifies that `vmctl opentsdb` migrates data
// from a mock OpenTSDB server into a single-node VictoriaMetrics instance and
// that the stored samples match the source points exactly.
func TestSingleVmctlOpenTSDBProtocol(t *testing.T) {
	fs.MustRemoveDir(t.Name())
	tc := apptest.NewTestCase(t)
	defer tc.Stop()

	vmsingleDst := tc.MustStartDefaultVmsingle()
	vmAddr := fmt.Sprintf("http://%s/", vmsingleDst.HTTPAddr())

	// Generate 60 points at 1-minute intervals starting 2 hours ago.
	// This ensures data falls within vmctl's default query window (now - retention).
	baseTS := time.Now().Add(-2 * time.Hour).Truncate(time.Minute).Unix()
	points := make([]openTSDBPoint, 0, 60)
	for i := range 60 {
		points = append(points, openTSDBPoint{
			Metric:    "test.cpu",
			Tags:      map[string]string{"host": "h1", "env": "prod"},
			Timestamp: baseTS + int64(i*60),
			Value:     float64(i),
		})
	}
	otsdb := newOpenTSDBMockServer(t, points)
	defer otsdb.close()

	vmctlFlags := []string{
		`opentsdb`,
		`--otsdb-addr=` + otsdb.httpAddr(),
		`--vm-addr=` + vmAddr,
		// NOTE(review): aggregator "ssum" differs from "sum" used in the
		// cluster test; the mock server ignores the aggregator, but confirm
		// this spelling is intended.
		`--otsdb-retentions=ssum-1m-avg:1d:1d`,
		`--otsdb-filters=test`,
		`--otsdb-normalize`,
		`--disable-progress-bar=true`,
		`-s`,
	}
	// "test.cpu" is normalized to "test_cpu" by --otsdb-normalize.
	testOpenTSDBProtocol(tc, vmsingleDst, vmctlFlags, points, "test_cpu", baseTS)
}
// TestClusterVmctlOpenTSDBProtocol verifies that `vmctl opentsdb` migrates data
// from a mock OpenTSDB server into a VictoriaMetrics cluster via vminsert.
func TestClusterVmctlOpenTSDBProtocol(t *testing.T) {
	fs.MustRemoveDir(t.Name())
	tc := apptest.NewTestCase(t)
	defer tc.Stop()

	cluster := tc.MustStartDefaultCluster()
	vmAddr := fmt.Sprintf("http://%s/", cluster.Vminsert.HTTPAddr())

	// Generate 60 points at 1-minute intervals starting 2 hours ago.
	baseTS := time.Now().Add(-2 * time.Hour).Truncate(time.Minute).Unix()
	points := make([]openTSDBPoint, 0, 60)
	for i := range 60 {
		points = append(points, openTSDBPoint{
			Metric:    "test.mem",
			Tags:      map[string]string{"host": "h1"},
			Timestamp: baseTS + int64(i*60),
			Value:     float64(i * 2),
		})
	}
	otsdb := newOpenTSDBMockServer(t, points)
	defer otsdb.close()

	vmctlFlags := []string{
		`opentsdb`,
		`--otsdb-addr=` + otsdb.httpAddr(),
		`--vm-addr=` + vmAddr,
		`--otsdb-retentions=sum-1m-avg:1d:1d`,
		`--otsdb-filters=test`,
		`--otsdb-normalize`,
		`--disable-progress-bar=true`,
		// --vm-account-id selects the tenant for cluster writes.
		`--vm-account-id=0`,
		`-s`,
	}
	// "test.mem" is normalized to "test_mem" by --otsdb-normalize.
	testOpenTSDBProtocol(tc, cluster, vmctlFlags, points, "test_mem", baseTS)
}
// testOpenTSDBProtocol drives a full vmctl opentsdb migration against the
// given write-querier and asserts that the stored samples match the mock
// server's points.
//
// queries is either a vmsingle instance or a cluster fixture; vmMetricName is
// the normalized metric name expected in VictoriaMetrics; baseTS is the unix
// timestamp (seconds) of the first generated point, used to build the query
// time range.
func testOpenTSDBProtocol(
	tc *apptest.TestCase,
	queries apptest.PrometheusWriteQuerier,
	vmctlFlags []string,
	points []openTSDBPoint,
	vmMetricName string,
	baseTS int64,
) {
	t := tc.T()
	t.Helper()

	// Build dynamic time range covering all data points with 1-hour padding.
	queryStart := time.Unix(baseTS-3600, 0).UTC().Format(time.RFC3339)
	queryEnd := time.Unix(baseTS+7200, 0).UTC().Format(time.RFC3339)

	// Ignore volatile response fields in all comparisons below.
	cmpOpt := cmpopts.IgnoreFields(apptest.PrometheusAPIV1QueryResponse{}, "Status", "Data.ResultType")

	// Sanity check: the destination must be empty before the migration runs.
	got := queries.PrometheusAPIV1Query(t, `{__name__=~".*"}`, apptest.QueryOpts{
		Step: "5m",
		Time: queryStart,
	})
	want := apptest.NewPrometheusAPIV1QueryResponse(t, `{"data":{"result":[]}}`)
	if diff := cmp.Diff(want, got, cmpOpt); diff != "" {
		t.Errorf("unexpected response (-want, +got):\n%s", diff)
	}

	tc.MustStartVmctl("vmctl", vmctlFlags)
	queries.ForceFlush(t)

	expected := buildExpectedOpenTSDBResult(points, vmMetricName)
	tc.Assert(&apptest.AssertOptions{
		Retries: 300,
		Msg:     `unexpected metrics stored via opentsdb protocol`,
		Got: func() any {
			r := queries.PrometheusAPIV1Export(t, fmt.Sprintf(`{__name__=%q}`, vmMetricName), apptest.QueryOpts{
				Start: queryStart,
				End:   queryEnd,
			})
			r.Sort()
			return r.Data.Result
		},
		Want: expected,
		// Reuse cmpOpt instead of rebuilding the identical IgnoreFields
		// option inline, so both comparisons stay in sync.
		CmpOpts: []cmp.Option{cmpOpt},
	})
}
// buildExpectedOpenTSDBResult converts the mock server's points into the query
// results expected from VictoriaMetrics after migration: samples are grouped
// per unique tag set under the normalized metric name, with timestamps
// converted from seconds to milliseconds, sorted for deterministic comparison.
func buildExpectedOpenTSDBResult(points []openTSDBPoint, vmMetricName string) []*apptest.QueryResult {
	grouped := map[string]*apptest.QueryResult{}
	for _, p := range points {
		// Each expected series is identified by __name__ plus the point's tags.
		metric := map[string]string{"__name__": vmMetricName}
		for k, v := range p.Tags {
			metric[k] = v
		}
		key := tagsKey(metric)
		if _, ok := grouped[key]; !ok {
			grouped[key] = &apptest.QueryResult{Metric: metric}
		}
		grouped[key].Samples = append(grouped[key].Samples, &apptest.Sample{
			Timestamp: p.Timestamp * 1000, // seconds -> milliseconds
			Value:     p.Value,
		})
	}
	out := make([]*apptest.QueryResult, 0, len(grouped))
	for _, v := range grouped {
		out = append(out, v)
	}
	// Wrap in a response solely to reuse its Sort, since map iteration
	// order above is random.
	resp := apptest.PrometheusAPIV1QueryResponse{
		Data: &apptest.QueryData{Result: out},
	}
	resp.Sort()
	return resp.Data.Result
}

View File

@@ -27,6 +27,7 @@ See also [LTS releases](https://docs.victoriametrics.com/victoriametrics/lts-rel
## tip
* FEATURE: all VictoriaMetrics components: add support for reading cpu/memory limits configured via [systemd slices](https://www.freedesktop.org/software/systemd/man/latest/systemd.slice.html). Previously, only limits set directly on the process's own cgroup were detected. See [#10635](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/10635). Thanks to @andriibeee for the contribution.
* FEATURE: [vmctl](https://docs.victoriametrics.com/victoriametrics/vmctl/): improve error handling during the OpenTSDB migration. See [#10797](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/10797).
* FEATURE: [vmui](https://docs.victoriametrics.com/victoriametrics/single-server-victoriametrics/#vmui): now `Run query` link on the Alerting Rules page correctly propagates the alerts interval and evaluation time. See [#10366](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/10366).
* FEATURE: [alerts](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/rules): add new `MetricNameStatsCacheUtilizationIsTooHigh` alerting rule to track overutilization of [Metric names usage stats tracker](https://docs.victoriametrics.com/victoriametrics/#track-ingested-metrics-usage) (used in [Cardinality Explorer](https://docs.victoriametrics.com/victoriametrics/#cardinality-explorer)). See [#10840](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/10840).
* FEATURE: [stream aggregation](https://docs.victoriametrics.com/victoriametrics/stream-aggregation/): add `vm_streamaggr_counter_resets_total` metric for `total*`, `increase*` and `rate*` outputs that is useful for aggregation behaviour tracking. These metrics help to identify issues described in [Troubleshooting: counter resets](https://docs.victoriametrics.com/victoriametrics/stream-aggregation/#counter-resets).