app/vmctl: return errors instead of silently skipping unexpected OpenTSDB responses

Previously, `GetData` in the OpenTSDB client returned an empty `Metric{}` with a
`nil` error for several conditions (multiple series returned, aggregate
tags present, `modifyData` failures), causing `vmctl opentsdb` to
silently drop series during migration.

This commit changes these silent return paths to return descriptive errors
that include the query string, so operators can detect and diagnose
partial migrations.

Related PR: https://github.com/VictoriaMetrics/VictoriaMetrics/pull/10797
Author: cubic-dev-ai[bot]
Date:   2026-04-22 11:28:55 +02:00 (committed by GitHub)
Parent: a3df0f890b
Commit: 2c262c5ef6

6 changed files with 420 additions and 50 deletions
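To make the change concrete, here is a minimal, self-contained sketch of the new control flow in `GetData`; `Result`, `Metric`, and `getData` below are simplified stand-ins for illustration, not the actual `opentsdb` package API:

```go
package main

import (
	"fmt"
	"log"
)

// Simplified stand-ins for the opentsdb package types.
type Result struct {
	Metric        string
	AggregateTags []string
}

type Metric struct{ Metric string }

// getData mirrors the fixed control flow: unexpected responses now become
// errors that name the query, instead of a silent (Metric{}, nil) return.
func getData(output []Result, q string) (Metric, error) {
	if len(output) < 1 {
		// An empty result set is still "no data", not an error.
		return Metric{}, nil
	}
	if len(output) > 1 {
		// Previously: return Metric{}, nil (the series was silently dropped).
		return Metric{}, fmt.Errorf("unexpected number of series returned: %d for query %q; expected 1", len(output), q)
	}
	if len(output[0].AggregateTags) > 0 {
		return Metric{}, fmt.Errorf("aggregate tags %v present in response for query %q; series may be suppressed", output[0].AggregateTags, q)
	}
	return Metric{Metric: output[0].Metric}, nil
}

func main() {
	_, err := getData([]Result{{Metric: "a"}, {Metric: "b"}}, "/api/query?m=sum:a")
	log.Println(err) // unexpected number of series returned: 2 ...
}
```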


@@ -8,10 +8,10 @@ import (
"time"
vmetrics "github.com/VictoriaMetrics/metrics"
"github.com/cheggaaa/pb/v3"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmctl/opentsdb"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmctl/vm"
"github.com/cheggaaa/pb/v3"
)
type otsdbProcessor struct {
@@ -89,9 +89,6 @@ func (op *otsdbProcessor) run(ctx context.Context) error {
// we're going to make serieslist * queryRanges queries, so we should represent that in the progress bar
otsdbSeriesTotal.Add(len(serieslist) * queryRanges)
bar := pb.StartNew(len(serieslist) * queryRanges)
defer func(bar *pb.ProgressBar) {
bar.Finish()
}(bar)
var wg sync.WaitGroup
for range op.otsdbcc {
wg.Go(func() {
@@ -106,41 +103,22 @@ func (op *otsdbProcessor) run(ctx context.Context) error {
}
})
}
/*
Loop through all series for this metric, processing all retentions and time ranges
requested. This loop is our primary "collect data from OpenTSDB loop" and should
be async, sending data to VictoriaMetrics over time.

The idea with having the select at the inner-most loop is to ensure quick
short-circuiting on error.
*/
runErr := op.sendQueries(ctx, serieslist, seriesCh, errCh, startTime)
for _, series := range serieslist {
for _, rt := range op.oc.Retentions {
for _, tr := range rt.QueryRanges {
select {
case otsdbErr := <-errCh:
return fmt.Errorf("opentsdb error: %s", otsdbErr)
case vmErr := <-op.im.Errors():
otsdbErrorsTotal.Inc()
return fmt.Errorf("import process failed: %s", wrapErr(vmErr, op.isVerbose))
case seriesCh <- queryObj{
Tr: tr, StartTime: startTime,
Series: series, Rt: opentsdb.RetentionMeta{
FirstOrder: rt.FirstOrder, SecondOrder: rt.SecondOrder, AggTime: rt.AggTime}}:
}
}
}
}
// Drain channels per metric
// Always drain channels and wait for workers to prevent goroutine leaks
close(seriesCh)
wg.Wait()
close(errCh)
// check for any lingering errors on the query side
for otsdbErr := range errCh {
return fmt.Errorf("import process failed: \n%s", otsdbErr)
if runErr == nil {
runErr = fmt.Errorf("import process failed: \n%s", otsdbErr)
}
}
bar.Finish()
if runErr != nil {
return runErr
}
log.Print(op.im.Stats())
}
op.im.Close()
@@ -155,6 +133,34 @@ func (op *otsdbProcessor) run(ctx context.Context) error {
return nil
}
// sendQueries iterates over all series and retention ranges, sending queries to workers.
// It returns early if ctx is canceled or an error is received.
func (op *otsdbProcessor) sendQueries(ctx context.Context, serieslist []opentsdb.Meta, seriesCh chan<- queryObj, errCh <-chan error, startTime int64) error {
for _, series := range serieslist {
for _, rt := range op.oc.Retentions {
for _, tr := range rt.QueryRanges {
select {
case <-ctx.Done():
return fmt.Errorf("context canceled: %s", ctx.Err())
case otsdbErr := <-errCh:
otsdbErrorsTotal.Inc()
return fmt.Errorf("opentsdb error: %s", otsdbErr)
case vmErr := <-op.im.Errors():
return fmt.Errorf("import process failed: %s", wrapErr(vmErr, op.isVerbose))
case seriesCh <- queryObj{
Tr: tr, StartTime: startTime,
Series: series, Rt: opentsdb.RetentionMeta{
FirstOrder: rt.FirstOrder,
SecondOrder: rt.SecondOrder,
AggTime: rt.AggTime,
}}:
}
}
}
}
return nil
}
func (op *otsdbProcessor) do(s queryObj) error {
start := s.StartTime - s.Tr.Start
end := s.StartTime - s.Tr.End
@@ -163,6 +169,7 @@ func (op *otsdbProcessor) do(s queryObj) error {
return fmt.Errorf("failed to collect data for %v in %v:%v :: %v", s.Series, s.Rt, s.Tr, err)
}
if len(data.Timestamps) < 1 || len(data.Values) < 1 {
log.Printf("no data found for %v in %v:%v...skipping", s.Series, s.Rt, s.Tr)
return nil
}
labels := make([]vm.LabelPair, 0, len(data.Tags))
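The drain-and-wait logic in `run` above exists to prevent goroutine leaks: the workers block on the channels, so the producer's early error return must not strand them. A minimal sketch of the pattern under simplified names (not the vmctl code):

```go
package main

import (
	"fmt"
	"sync"
)

// process shows the drain-and-wait pattern used in run(): even when the
// producer stops early with an error, the input channel is closed, the
// workers are waited on, and errCh is drained, so no goroutine is leaked.
func process(jobs []int) error {
	jobCh := make(chan int)
	errCh := make(chan error, len(jobs)) // buffered so workers never block

	var wg sync.WaitGroup
	for i := 0; i < 4; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for j := range jobCh {
				if j < 0 {
					errCh <- fmt.Errorf("bad job %d", j)
				}
			}
		}()
	}

	runErr := send(jobs, jobCh, errCh) // may stop early on error

	// Always drain: close input, wait for workers, then close errCh so
	// the final range below terminates.
	close(jobCh)
	wg.Wait()
	close(errCh)
	for err := range errCh {
		if runErr == nil {
			runErr = fmt.Errorf("import process failed: \n%s", err)
		}
	}
	return runErr
}

// send short-circuits on the first error, leaving remaining jobs unsent.
func send(jobs []int, jobCh chan<- int, errCh <-chan error) error {
	for _, j := range jobs {
		select {
		case err := <-errCh:
			return err
		case jobCh <- j:
		}
	}
	return nil
}

func main() {
	fmt.Println(process([]int{1, 2, -3, 4}))
}
```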


@@ -108,10 +108,10 @@ func (c Client) FindMetrics(q string) ([]string, error) {
if err != nil {
return nil, fmt.Errorf("failed to send GET request to %q: %s", q, err)
}
defer func() { _ = resp.Body.Close() }()
if resp.StatusCode != 200 {
return nil, fmt.Errorf("bad return from OpenTSDB: %d: %v", resp.StatusCode, resp)
}
defer func() { _ = resp.Body.Close() }()
body, err := io.ReadAll(resp.Body)
if err != nil {
return nil, fmt.Errorf("could not retrieve metric data from %q: %s", q, err)
@@ -130,12 +130,12 @@ func (c Client) FindSeries(metric string) ([]Meta, error) {
q := fmt.Sprintf("%s/api/search/lookup?m=%s&limit=%d", c.Addr, metric, c.Limit)
resp, err := c.c.Get(q)
if err != nil {
return nil, fmt.Errorf("failed to set GET request to %q: %s", q, err)
return nil, fmt.Errorf("failed to send GET request to %q: %s", q, err)
}
defer func() { _ = resp.Body.Close() }()
if resp.StatusCode != 200 {
return nil, fmt.Errorf("bad return from OpenTSDB: %d: %v", resp.StatusCode, resp)
}
defer func() { _ = resp.Body.Close() }()
body, err := io.ReadAll(resp.Body)
if err != nil {
return nil, fmt.Errorf("could not retrieve series data from %q: %s", q, err)
@@ -185,6 +185,7 @@ func (c Client) GetData(series Meta, rt RetentionMeta, start int64, end int64, m
if err != nil {
return Metric{}, fmt.Errorf("failed to send GET request to %q: %s", q, err)
}
defer func() { _ = resp.Body.Close() }()
/*
There are three potential failures here, none of which should kill the entire
migration run:
@@ -196,7 +197,6 @@ func (c Client) GetData(series Meta, rt RetentionMeta, start int64, end int64, m
log.Printf("bad response code from OpenTSDB query %v for %q...skipping", resp.StatusCode, q)
return Metric{}, nil
}
defer func() { _ = resp.Body.Close() }()
body, err := io.ReadAll(resp.Body)
if err != nil {
log.Println("couldn't read response body from OpenTSDB query...skipping")
@@ -239,27 +239,20 @@ func (c Client) GetData(series Meta, rt RetentionMeta, start int64, end int64, m
In all "bad" cases, we don't end the migration, we just don't process that particular message
*/
if len(output) < 1 {
// no results returned...return an empty object without error
return Metric{}, nil
}
if len(output) > 1 {
// multiple series returned for a single query. We can't process this right, so...
return Metric{}, nil
return Metric{}, fmt.Errorf("unexpected number of series returned: %d for query %q; expected 1", len(output), q)
}
if len(output[0].AggregateTags) > 0 {
// This failure means we've suppressed potential series somehow...
return Metric{}, nil
return Metric{}, fmt.Errorf("aggregate tags %v present in response for query %q; series may be suppressed", output[0].AggregateTags, q)
}
data := Metric{}
data.Metric = output[0].Metric
data.Tags = output[0].Tags
/*
We evaluate data for correctness before formatting the actual values
to skip a little bit of time if the series has invalid formatting
*/
data, err = modifyData(data, c.Normalize)
if err != nil {
return Metric{}, nil
return Metric{}, fmt.Errorf("failed to convert metric data for query %q: %w", q, err)
}
/*

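The repeated `defer resp.Body.Close()` moves in this file are all the same fix: the close must be registered before the status-code check so the body is released on every early-return path. A minimal sketch of the corrected ordering (the function and URL are illustrative):

```go
package main

import (
	"fmt"
	"io"
	"net/http"
)

// fetch shows the corrected ordering: the body must be closed on every
// return path, so the defer is registered immediately after the error
// check on Get, before any early return such as the status-code check.
func fetch(url string) ([]byte, error) {
	resp, err := http.Get(url)
	if err != nil {
		return nil, fmt.Errorf("failed to send GET request to %q: %s", url, err)
	}
	defer func() { _ = resp.Body.Close() }()
	if resp.StatusCode != http.StatusOK {
		// Before the fix, returning here leaked the response body,
		// because the defer was only registered after this check.
		return nil, fmt.Errorf("bad return: %d", resp.StatusCode)
	}
	return io.ReadAll(resp.Body)
}

func main() {
	body, err := fetch("http://example.com/")
	fmt.Println(len(body), err)
}
```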

@@ -32,7 +32,7 @@ func convertDuration(duration string) (time.Duration, error) {
var err error
var timeValue int
if strings.HasSuffix(duration, "y") {
timeValue, err = strconv.Atoi(strings.Trim(duration, "y"))
timeValue, err = strconv.Atoi(strings.TrimSuffix(duration, "y"))
if err != nil {
return 0, fmt.Errorf("invalid time range: %q", duration)
}
@@ -42,7 +42,7 @@ func convertDuration(duration string) (time.Duration, error) {
return 0, fmt.Errorf("invalid time range: %q", duration)
}
} else if strings.HasSuffix(duration, "w") {
timeValue, err = strconv.Atoi(strings.Trim(duration, "w"))
timeValue, err = strconv.Atoi(strings.TrimSuffix(duration, "w"))
if err != nil {
return 0, fmt.Errorf("invalid time range: %q", duration)
}
@@ -52,7 +52,7 @@ func convertDuration(duration string) (time.Duration, error) {
return 0, fmt.Errorf("invalid time range: %q", duration)
}
} else if strings.HasSuffix(duration, "d") {
timeValue, err = strconv.Atoi(strings.Trim(duration, "d"))
timeValue, err = strconv.Atoi(strings.TrimSuffix(duration, "d"))
if err != nil {
return 0, fmt.Errorf("invalid time range: %q", duration)
}
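The `strings.Trim` to `strings.TrimSuffix` changes fix a subtle cutset bug: `Trim` strips any run of the given characters from both ends, so malformed durations could slip past the `Atoi` check, while `TrimSuffix` removes exactly one trailing suffix. A quick illustration:

```go
package main

import (
	"fmt"
	"strings"
)

func main() {
	// Trim treats "d" as a character set and strips it from both ends,
	// so a malformed duration like "7dd" silently parses as 7.
	fmt.Println(strings.Trim("7dd", "d"))       // "7"  (extra 'd' swallowed)
	fmt.Println(strings.TrimSuffix("7dd", "d")) // "7d" (Atoi then rejects it)
	fmt.Println(strings.Trim("d7d", "d"))       // "7"  (leading 'd' swallowed too)
	fmt.Println(strings.TrimSuffix("d7d", "d")) // "d7"
}
```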
@@ -95,6 +95,9 @@ func convertRetention(retention string, offset int64, msecTime bool) (Retention,
if !msecTime {
queryLength = queryLength / 1000
}
if queryLength <= 0 {
return Retention{}, fmt.Errorf("ttl %q resolves to non-positive query range %d; use a larger duration", chunks[2], queryLength)
}
queryRange := queryLength
// bump by the offset so we don't look at empty ranges any time offset > ttl
queryLength += offset
@@ -138,16 +141,29 @@ func convertRetention(retention string, offset int64, msecTime bool) (Retention,
2. we discover the actual size of each "chunk"
This is second division step
*/
querySize = int64(queryRange / (queryRange / (rowLength * 4)))
divisor := queryRange / (rowLength * 4)
if divisor == 0 {
querySize = queryRange
} else {
querySize = queryRange / divisor
}
} else {
/*
Unless the aggTime (how long a range of data we're requesting per individual point)
is greater than the row size. Then we'll need to use that to determine
how big each individual query should be
*/
querySize = int64(queryRange / (queryRange / (aggTime * 4)))
divisor := queryRange / (aggTime * 4)
if divisor == 0 {
querySize = queryRange
} else {
querySize = queryRange / divisor
}
}
if querySize <= 0 {
return Retention{}, fmt.Errorf("computed non-positive querySize=%d for retention %q; check parameters", querySize, retention)
}
var timeChunks []TimeRange
var i int64
for i = offset; i <= queryLength; i = i + querySize {
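The divisor guards address an integer-division trap: in the old expression `queryRange / (queryRange / (rowLength * 4))`, the inner division truncates to zero whenever `queryRange < rowLength*4`, turning the outer division into a divide-by-zero panic. A small sketch of the guarded computation, with assumed second-resolution values:

```go
package main

import "fmt"

// chunk reproduces the guarded computation from convertRetention:
// the old expression queryRange / (queryRange / (rowLength*4)) panics
// when queryRange < rowLength*4, because the inner integer division
// truncates to zero.
func chunk(queryRange, rowLength int64) int64 {
	divisor := queryRange / (rowLength * 4)
	if divisor == 0 {
		// Fall back to a single chunk covering the whole range.
		return queryRange
	}
	return queryRange / divisor
}

func main() {
	// A 1h (3600s) range with 1h rows: 3600/(3600*4) == 0, so the old code panicked.
	fmt.Println(chunk(3600, 3600)) // 3600: one chunk
	// A 1d range with 1h rows: divisor = 86400/14400 = 6, giving 14400s chunks.
	fmt.Println(chunk(86400, 3600)) // 14400
}
```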