lib/protoparser/prometheus: adds Prometheus 3.0 utf-8 quoted syntax support

This commit adds parsing support for the Prometheus 3.0 text exposition format, which introduces a quoted syntax for metric names and label names containing `utf-8` characters.

 See the following doc:
https://github.com/prometheus/proposals/blob/main/proposals/2023-08-21-utf8.md#syntax-examples

Related PR:
https://github.com/VictoriaMetrics/VictoriaMetrics/pull/8692
This commit is contained in:
Ted Possible
2025-05-06 08:48:36 -07:00
committed by GitHub
parent 0e313e5355
commit 1b0d535e61
4 changed files with 166 additions and 31 deletions

View File

@@ -25,6 +25,7 @@ See also [LTS releases](https://docs.victoriametrics.com/victoriametrics/lts-rel
* FEATURE: [vmbackup](https://docs.victoriametrics.com/vmbackup/), [vmrestore](https://docs.victoriametrics.com/vmrestore/), [vmbackupmanager](https://docs.victoriametrics.com/vmbackupmanager/): cancel currently running operation if graceful shutdown was requested. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/8554).
* FEATURE: [vmbackupmanager](https://docs.victoriametrics.com/vmbackupmanager/): display completion status in `/api/v1/backups` API response and `backup list` command. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5361).
* FEATURE: [vmsingle](https://docs.victoriametrics.com/victoriametrics/single-server-victoriametrics/) and `vmstorage` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/victoriametrics/cluster-victoriametrics/): add `vm_retention_filters_partitions_scheduled` and `vm_retention_filters_partitions_scheduled_size_bytes` gauge metrics to reflect [retention filters](https://docs.victoriametrics.com/#retention-filters) process.
* FEATURE: [vmsingle](https://docs.victoriametrics.com/single-server-victoriametrics/), [vmagent](https://docs.victoriametrics.com/vmagent/): add support for Prometheus 3.0 utf-8 quoted metric names and label names during scraping. See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/8692) for details.
* BUGFIX: [vmalert-tool](https://docs.victoriametrics.com/victoriametrics/vmalert-tool/): fix parsing for (+/-)Inf values and scientific notation in `values` field. Thanks to @evkuzin for [#8847](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/8847).
* BUGFIX: [vmui](https://docs.victoriametrics.com/victoriametrics/single-server-victoriametrics/#vmui): use `retentionFilter` flag name in debugging interface to make it consistent with flag definition. Previously, flag name in debugging interface was different from command-line configuration so copying command-line flags for debugging produced an error. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/8697).

View File

@@ -238,7 +238,8 @@ func TestNewLabelsFromStringFailure(t *testing.T) {
f(`foo{bar="baz`)
f(`foo{bar="baz"`)
f(`foo{bar="baz",`)
f(`foo{"bar"="baz"}`)
// This will no longer fail with support of Prometheus 3.0 quoted UTF8 labels
//f(`foo{"bar"="baz"}`)
f(`{"bar":"baz"}`)
f(`{bar:"baz"}`)
f(`{bar=~"baz"}`)

View File

@@ -7,9 +7,10 @@ import (
"strings"
"sync"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/metrics"
"github.com/valyala/fastjson/fastfloat"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
)
// Rows contains parsed Prometheus rows.
@@ -113,7 +114,7 @@ func (r *Row) unmarshal(s string, tagsPool []Tag, noEscapes bool) ([]Tag, error)
s = s[n+1:]
tagsStart := len(tagsPool)
var err error
s, tagsPool, err = unmarshalTags(tagsPool, s, noEscapes)
s, tagsPool, err = r.unmarshalTags(tagsPool, s, noEscapes)
if err != nil {
return tagsPool, fmt.Errorf("cannot unmarshal tags: %w", err)
}
@@ -230,43 +231,91 @@ func unmarshalRow(dst []Row, s string, tagsPool []Tag, noEscapes bool, errLogger
var invalidLines = metrics.NewCounter(`vm_rows_invalid_total{type="prometheus"}`)
func unmarshalTags(dst []Tag, s string, noEscapes bool) (string, []Tag, error) {
// unmarshalQuotedString reads a double-quoted string from the start of s.
//
// It returns the text between the quotes followed by the remainder of s
// located after the closing quote.
//
// When noEscapes is true, the first double quote after the opening one is
// treated as the closing quote and the text is returned as is. Otherwise the
// closing quote is located via findClosingQuote and the text is passed
// through unescapeValue.
//
// An error is returned, together with the unmodified s as the remainder,
// if s does not start with a double quote or if the closing quote is missing.
//
// Quoted metric names and label names come from the utf-8 support
// in the Prometheus text exposition format. See
// https://github.com/prometheus/proposals/blob/main/proposals/2023-08-21-utf8.md#syntax-examples
func unmarshalQuotedString(s string, noEscapes bool) (string, string, error) {
	if !strings.HasPrefix(s, `"`) {
		return "", s, fmt.Errorf("missing starting double quote in string: %q", s)
	}

	if !noEscapes {
		// The string may contain escape sequences, so the closing quote
		// must be searched with the escape-aware helper.
		end := findClosingQuote(s)
		if end == -1 {
			return "", s, fmt.Errorf("missing closing double quote in string: %q", s)
		}
		return unescapeValue(s[1:end]), s[end+1:], nil
	}

	// No escapes are expected - the next double quote closes the string.
	afterQuote := s[1:]
	end := strings.IndexByte(afterQuote, '"')
	if end == -1 {
		return "", s, fmt.Errorf("missing closing double quote in string: %q", s)
	}
	return afterQuote[:end], afterQuote[end+1:], nil
}
func (r *Row) unmarshalTags(dst []Tag, s string, noEscapes bool) (string, []Tag, error) {
var err error
for {
s = skipLeadingWhitespace(s)
if len(s) > 0 && s[0] == '}' {
// End of tags found.
return s[1:], dst, nil
}
n := strings.IndexByte(s, '=')
n := strings.IndexByte(s, '"')
if n < 0 {
// end of tags
if len(s) > 0 && s[0] == '}' {
return s[1:], dst, nil
}
return s, dst, fmt.Errorf("missing value for tag %q", s)
}
key := skipTrailingWhitespace(s[:n])
if strings.IndexByte(key, '"') >= 0 {
return s, dst, fmt.Errorf("tag key %q cannot contain double quotes", key)
}
s = skipLeadingWhitespace(s[n+1:])
if len(s) == 0 || s[0] != '"' {
return s, dst, fmt.Errorf("expecting quoted value for tag %q; got %q", key, s)
}
value := s[1:]
if noEscapes {
// Fast path - the line has no escape chars
n = strings.IndexByte(value, '"')
if n < 0 {
return s, dst, fmt.Errorf("missing closing quote for tag value %q", s)
// Determine if this is a value or quoted label
possibleKey := skipTrailingWhitespace(s[:n])
possibleKeyLen := len(possibleKey)
key := ""
if possibleKeyLen == 0 {
// Parse quoted label - {"label"="value"} or {"metric"}
key, s, err = unmarshalQuotedString(s, noEscapes)
if err != nil {
return s, dst, err
}
s = value[n+1:]
value = value[:n]
s = skipLeadingWhitespace(s)
if len(s) > 0 {
if s[0] == ',' || s[0] == '}' {
// quoted metric name {"metric_name"}
if r.Metric != "" {
return s, dst, fmt.Errorf("metric name %q already set, duplicate metric name %q", r.Metric, key)
}
r.Metric = key
if len(s) > 1 && s[0] == ',' {
s = s[1:]
}
continue
} else if s[0] != '=' {
// We are a quoted label that isn't preceded by a comma or at the end
// of the tags so we must have a value
return s, dst, fmt.Errorf("missing value for quoted tag %q", key)
}
s = skipLeadingWhitespace(s[1:])
}
// Fall through to parsing value
} else {
// Slow path - the line contains escape chars
n = findClosingQuote(s)
if n < 0 {
return s, dst, fmt.Errorf("missing closing quote for tag value %q", s)
c := possibleKey[len(possibleKey)-1]
// unquoted label {label="value"}
if c == '=' {
// Parse unquoted label
key = skipLeadingWhitespace(s[:possibleKeyLen-1])
key = skipTrailingWhitespace(key)
s = skipLeadingWhitespace(s[possibleKeyLen:])
} else {
// unquoted tag without a value
return s, dst, fmt.Errorf("missing value for unquoted tag %q", s)
}
value = unescapeValue(s[1:n])
s = s[n+1:]
}
// Parse value
var value string
value, s, err = unmarshalQuotedString(s, noEscapes)
if err != nil {
return s, dst, err
}
if len(key) > 0 {
// Allow empty values (len(value)==0) - see https://github.com/VictoriaMetrics/VictoriaMetrics/issues/453
if cap(dst) > len(dst) {

View File

@@ -221,6 +221,12 @@ func TestRowsUnmarshalFailure(t *testing.T) {
f(`a{"__name__":"upsd_time_left_ns","host":"myhost", status_OB="true"} 12`)
f(`a{host:"myhost"} 12`)
f(`a{host:"myhost",foo="bar"} 12`)
// invalid quoted UTF8 tags
f(`metric_"name"{"foo"="bar"}`)
f(`"metric_name"{"name":"name}`)
f(`metric_"name{"name":"name"}`)
f(`metric{"foo":"bar"}`)
f(`{"foo":"bar", "metric"}`)
// empty metric name
f(`{foo="bar"}`)
@@ -242,6 +248,13 @@ func TestRowsUnmarshalFailure(t *testing.T) {
// Invalid timestamp
f("foo 123 bar")
// metric name defined multiple times
f(`{"foo", "foo2", bar="baz"} 1 2`)
f(`foobar{"foo", bar="baz"} 1 2`)
// missing closing quotes on key
f(`{"a", "b = "c"}`)
// empty metric name
f(`{"a"="ok"} 1`)
}
func TestRowsUnmarshalSuccess(t *testing.T) {
@@ -466,6 +479,77 @@ cassandra_token_ownership_ratio 78.9`, &Rows{
Timestamp: 2000,
}},
})
// UTF8 Quoted tags
f(`foo{"bar"="baz"} 1 2`, &Rows{
Rows: []Row{{
Metric: "foo",
Tags: []Tag{{
Key: "bar",
Value: "baz",
}},
Value: 1,
Timestamp: 2000,
}},
})
f(`{"foo", "bar"="baz"} 1 2`, &Rows{
Rows: []Row{{
Metric: "foo",
Tags: []Tag{{
Key: "bar",
Value: "baz",
}},
Value: 1,
Timestamp: 2000,
}},
})
f(`{"foo", "bar"="baf\"y"} 1 2`, &Rows{
Rows: []Row{{
Metric: "foo",
Tags: []Tag{{
Key: "bar",
Value: `baf"y`,
}},
Value: 1,
Timestamp: 2000,
}},
})
f(`{bar="baz", "foo"} 1 2`, &Rows{
Rows: []Row{{
Metric: "foo",
Tags: []Tag{{
Key: "bar",
Value: "baz",
}},
Value: 1,
Timestamp: 2000,
}},
})
f(`{"foo"} 1 2`, &Rows{
Rows: []Row{{
Metric: "foo",
Value: 1,
Timestamp: 2000,
}},
})
// Special character quoted UTF8 tests
f(`{"温度{房间"} 1 2`, &Rows{
Rows: []Row{{
Metric: "温度{房间",
Value: 1,
Timestamp: 2000,
}},
})
f(`{"foo", "温度{房间=\"水电费"="baz"} 1 2`, &Rows{
Rows: []Row{{
Metric: "foo",
Tags: []Tag{{
Key: `温度{房间="水电费`,
Value: "baz",
}},
Value: 1,
Timestamp: 2000,
}},
})
f(`foo{bar="b\"a\\z"} -1.2`, &Rows{
Rows: []Row{{
Metric: "foo",