mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2026-05-17 08:36:55 +03:00
app/vmctl: return errors instead of silently skipping unexpected OpenTSDB responses
Previously
- `GetData` in the OpenTSDB client was returning empty `Metric{}` with
`nil` error for several conditions (multiple series returned, aggregate
tags present, `modifyData` failures), causing `vmctl opentsdb` to
silently drop series during migration
This commit changes these silent return paths to return proper errors with
descriptive messages including the query string, so operators can detect
and diagnose partial migrations.
Related PR https://github.com/VictoriaMetrics/VictoriaMetrics/pull/10797
This commit is contained in:
committed by
GitHub
parent
a3df0f890b
commit
2c262c5ef6
@@ -8,10 +8,10 @@ import (
|
||||
"time"
|
||||
|
||||
vmetrics "github.com/VictoriaMetrics/metrics"
|
||||
"github.com/cheggaaa/pb/v3"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmctl/opentsdb"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmctl/vm"
|
||||
"github.com/cheggaaa/pb/v3"
|
||||
)
|
||||
|
||||
type otsdbProcessor struct {
|
||||
@@ -89,9 +89,6 @@ func (op *otsdbProcessor) run(ctx context.Context) error {
|
||||
// we're going to make serieslist * queryRanges queries, so we should represent that in the progress bar
|
||||
otsdbSeriesTotal.Add(len(serieslist) * queryRanges)
|
||||
bar := pb.StartNew(len(serieslist) * queryRanges)
|
||||
defer func(bar *pb.ProgressBar) {
|
||||
bar.Finish()
|
||||
}(bar)
|
||||
var wg sync.WaitGroup
|
||||
for range op.otsdbcc {
|
||||
wg.Go(func() {
|
||||
@@ -106,41 +103,22 @@ func (op *otsdbProcessor) run(ctx context.Context) error {
|
||||
}
|
||||
})
|
||||
}
|
||||
/*
|
||||
Loop through all series for this metric, processing all retentions and time ranges
|
||||
requested. This loop is our primary "collect data from OpenTSDB loop" and should
|
||||
be async, sending data to VictoriaMetrics over time.
|
||||
runErr := op.sendQueries(ctx, serieslist, seriesCh, errCh, startTime)
|
||||
|
||||
The idea with having the select at the inner-most loop is to ensure quick
|
||||
short-circuiting on error.
|
||||
*/
|
||||
for _, series := range serieslist {
|
||||
for _, rt := range op.oc.Retentions {
|
||||
for _, tr := range rt.QueryRanges {
|
||||
select {
|
||||
case otsdbErr := <-errCh:
|
||||
return fmt.Errorf("opentsdb error: %s", otsdbErr)
|
||||
case vmErr := <-op.im.Errors():
|
||||
otsdbErrorsTotal.Inc()
|
||||
return fmt.Errorf("import process failed: %s", wrapErr(vmErr, op.isVerbose))
|
||||
case seriesCh <- queryObj{
|
||||
Tr: tr, StartTime: startTime,
|
||||
Series: series, Rt: opentsdb.RetentionMeta{
|
||||
FirstOrder: rt.FirstOrder, SecondOrder: rt.SecondOrder, AggTime: rt.AggTime}}:
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Drain channels per metric
|
||||
// Always drain channels and wait for workers to prevent goroutine leaks
|
||||
close(seriesCh)
|
||||
wg.Wait()
|
||||
close(errCh)
|
||||
// check for any lingering errors on the query side
|
||||
for otsdbErr := range errCh {
|
||||
return fmt.Errorf("import process failed: \n%s", otsdbErr)
|
||||
if runErr == nil {
|
||||
runErr = fmt.Errorf("import process failed: \n%s", otsdbErr)
|
||||
}
|
||||
}
|
||||
bar.Finish()
|
||||
if runErr != nil {
|
||||
return runErr
|
||||
}
|
||||
log.Print(op.im.Stats())
|
||||
}
|
||||
op.im.Close()
|
||||
@@ -155,6 +133,34 @@ func (op *otsdbProcessor) run(ctx context.Context) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// sendQueries iterates over all series and retention ranges, sending queries to workers.
|
||||
// It returns early if ctx is canceled or an error is received.
|
||||
func (op *otsdbProcessor) sendQueries(ctx context.Context, serieslist []opentsdb.Meta, seriesCh chan<- queryObj, errCh <-chan error, startTime int64) error {
|
||||
for _, series := range serieslist {
|
||||
for _, rt := range op.oc.Retentions {
|
||||
for _, tr := range rt.QueryRanges {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return fmt.Errorf("context canceled: %s", ctx.Err())
|
||||
case otsdbErr := <-errCh:
|
||||
otsdbErrorsTotal.Inc()
|
||||
return fmt.Errorf("opentsdb error: %s", otsdbErr)
|
||||
case vmErr := <-op.im.Errors():
|
||||
return fmt.Errorf("import process failed: %s", wrapErr(vmErr, op.isVerbose))
|
||||
case seriesCh <- queryObj{
|
||||
Tr: tr, StartTime: startTime,
|
||||
Series: series, Rt: opentsdb.RetentionMeta{
|
||||
FirstOrder: rt.FirstOrder,
|
||||
SecondOrder: rt.SecondOrder,
|
||||
AggTime: rt.AggTime,
|
||||
}}:
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (op *otsdbProcessor) do(s queryObj) error {
|
||||
start := s.StartTime - s.Tr.Start
|
||||
end := s.StartTime - s.Tr.End
|
||||
@@ -163,6 +169,7 @@ func (op *otsdbProcessor) do(s queryObj) error {
|
||||
return fmt.Errorf("failed to collect data for %v in %v:%v :: %v", s.Series, s.Rt, s.Tr, err)
|
||||
}
|
||||
if len(data.Timestamps) < 1 || len(data.Values) < 1 {
|
||||
log.Printf("no data found for %v in %v:%v...skipping", s.Series, s.Rt, s.Tr)
|
||||
return nil
|
||||
}
|
||||
labels := make([]vm.LabelPair, 0, len(data.Tags))
|
||||
|
||||
@@ -108,10 +108,10 @@ func (c Client) FindMetrics(q string) ([]string, error) {
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to send GET request to %q: %s", q, err)
|
||||
}
|
||||
defer func() { _ = resp.Body.Close() }()
|
||||
if resp.StatusCode != 200 {
|
||||
return nil, fmt.Errorf("bad return from OpenTSDB: %d: %v", resp.StatusCode, resp)
|
||||
}
|
||||
defer func() { _ = resp.Body.Close() }()
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("could not retrieve metric data from %q: %s", q, err)
|
||||
@@ -130,12 +130,12 @@ func (c Client) FindSeries(metric string) ([]Meta, error) {
|
||||
q := fmt.Sprintf("%s/api/search/lookup?m=%s&limit=%d", c.Addr, metric, c.Limit)
|
||||
resp, err := c.c.Get(q)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to set GET request to %q: %s", q, err)
|
||||
return nil, fmt.Errorf("failed to send GET request to %q: %s", q, err)
|
||||
}
|
||||
defer func() { _ = resp.Body.Close() }()
|
||||
if resp.StatusCode != 200 {
|
||||
return nil, fmt.Errorf("bad return from OpenTSDB: %d: %v", resp.StatusCode, resp)
|
||||
}
|
||||
defer func() { _ = resp.Body.Close() }()
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("could not retrieve series data from %q: %s", q, err)
|
||||
@@ -185,6 +185,7 @@ func (c Client) GetData(series Meta, rt RetentionMeta, start int64, end int64, m
|
||||
if err != nil {
|
||||
return Metric{}, fmt.Errorf("failed to send GET request to %q: %s", q, err)
|
||||
}
|
||||
defer func() { _ = resp.Body.Close() }()
|
||||
/*
|
||||
There are three potential failures here, none of which should kill the entire
|
||||
migration run:
|
||||
@@ -196,7 +197,6 @@ func (c Client) GetData(series Meta, rt RetentionMeta, start int64, end int64, m
|
||||
log.Printf("bad response code from OpenTSDB query %v for %q...skipping", resp.StatusCode, q)
|
||||
return Metric{}, nil
|
||||
}
|
||||
defer func() { _ = resp.Body.Close() }()
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
log.Println("couldn't read response body from OpenTSDB query...skipping")
|
||||
@@ -239,27 +239,20 @@ func (c Client) GetData(series Meta, rt RetentionMeta, start int64, end int64, m
|
||||
In all "bad" cases, we don't end the migration, we just don't process that particular message
|
||||
*/
|
||||
if len(output) < 1 {
|
||||
// no results returned...return an empty object without error
|
||||
return Metric{}, nil
|
||||
}
|
||||
if len(output) > 1 {
|
||||
// multiple series returned for a single query. We can't process this right, so...
|
||||
return Metric{}, nil
|
||||
return Metric{}, fmt.Errorf("unexpected number of series returned: %d for query %q; expected 1", len(output), q)
|
||||
}
|
||||
if len(output[0].AggregateTags) > 0 {
|
||||
// This failure means we've suppressed potential series somehow...
|
||||
return Metric{}, nil
|
||||
return Metric{}, fmt.Errorf("aggregate tags %v present in response for query %q; series may be suppressed", output[0].AggregateTags, q)
|
||||
}
|
||||
data := Metric{}
|
||||
data.Metric = output[0].Metric
|
||||
data.Tags = output[0].Tags
|
||||
/*
|
||||
We evaluate data for correctness before formatting the actual values
|
||||
to skip a little bit of time if the series has invalid formatting
|
||||
*/
|
||||
data, err = modifyData(data, c.Normalize)
|
||||
if err != nil {
|
||||
return Metric{}, nil
|
||||
return Metric{}, fmt.Errorf("failed to convert metric data for query %q: %w", q, err)
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
@@ -32,7 +32,7 @@ func convertDuration(duration string) (time.Duration, error) {
|
||||
var err error
|
||||
var timeValue int
|
||||
if strings.HasSuffix(duration, "y") {
|
||||
timeValue, err = strconv.Atoi(strings.Trim(duration, "y"))
|
||||
timeValue, err = strconv.Atoi(strings.TrimSuffix(duration, "y"))
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("invalid time range: %q", duration)
|
||||
}
|
||||
@@ -42,7 +42,7 @@ func convertDuration(duration string) (time.Duration, error) {
|
||||
return 0, fmt.Errorf("invalid time range: %q", duration)
|
||||
}
|
||||
} else if strings.HasSuffix(duration, "w") {
|
||||
timeValue, err = strconv.Atoi(strings.Trim(duration, "w"))
|
||||
timeValue, err = strconv.Atoi(strings.TrimSuffix(duration, "w"))
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("invalid time range: %q", duration)
|
||||
}
|
||||
@@ -52,7 +52,7 @@ func convertDuration(duration string) (time.Duration, error) {
|
||||
return 0, fmt.Errorf("invalid time range: %q", duration)
|
||||
}
|
||||
} else if strings.HasSuffix(duration, "d") {
|
||||
timeValue, err = strconv.Atoi(strings.Trim(duration, "d"))
|
||||
timeValue, err = strconv.Atoi(strings.TrimSuffix(duration, "d"))
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("invalid time range: %q", duration)
|
||||
}
|
||||
@@ -95,6 +95,9 @@ func convertRetention(retention string, offset int64, msecTime bool) (Retention,
|
||||
if !msecTime {
|
||||
queryLength = queryLength / 1000
|
||||
}
|
||||
if queryLength <= 0 {
|
||||
return Retention{}, fmt.Errorf("ttl %q resolves to non-positive query range %d; use a larger duration", chunks[2], queryLength)
|
||||
}
|
||||
queryRange := queryLength
|
||||
// bump by the offset so we don't look at empty ranges any time offset > ttl
|
||||
queryLength += offset
|
||||
@@ -138,16 +141,29 @@ func convertRetention(retention string, offset int64, msecTime bool) (Retention,
|
||||
2. we discover the actual size of each "chunk"
|
||||
This is second division step
|
||||
*/
|
||||
querySize = int64(queryRange / (queryRange / (rowLength * 4)))
|
||||
divisor := queryRange / (rowLength * 4)
|
||||
if divisor == 0 {
|
||||
querySize = queryRange
|
||||
} else {
|
||||
querySize = queryRange / divisor
|
||||
}
|
||||
} else {
|
||||
/*
|
||||
Unless the aggTime (how long a range of data we're requesting per individual point)
|
||||
is greater than the row size. Then we'll need to use that to determine
|
||||
how big each individual query should be
|
||||
*/
|
||||
querySize = int64(queryRange / (queryRange / (aggTime * 4)))
|
||||
divisor := queryRange / (aggTime * 4)
|
||||
if divisor == 0 {
|
||||
querySize = queryRange
|
||||
} else {
|
||||
querySize = queryRange / divisor
|
||||
}
|
||||
}
|
||||
|
||||
if querySize <= 0 {
|
||||
return Retention{}, fmt.Errorf("computed non-positive querySize=%d for retention %q; check parameters", querySize, retention)
|
||||
}
|
||||
var timeChunks []TimeRange
|
||||
var i int64
|
||||
for i = offset; i <= queryLength; i = i + querySize {
|
||||
|
||||
Reference in New Issue
Block a user