mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2026-05-17 08:36:55 +03:00
lib/cgroup: support reading cpu/memory limits from systemd slices
cgroup v2 supports slices (aka path hierarchy) for resource limits. This is mostly used by systemd and by container runtimes built on top of it. This commit reads the cgroup sub-path for systemd slices and traverses it up to the root, using the minimal limit value found along the way. Related docs: https://docs.oracle.com/en/operating-systems/oracle-linux/9/systemd/SystemdMngCgroupsV2.html#SlicesServicesScopesHierarchy https://www.freedesktop.org/software/systemd/man/latest/systemd.slice.html Fixes https://github.com/VictoriaMetrics/VictoriaMetrics/issues/10635
This commit is contained in:
@@ -26,6 +26,7 @@ See also [LTS releases](https://docs.victoriametrics.com/victoriametrics/lts-rel
|
||||
|
||||
## tip
|
||||
|
||||
* FEATURE: all VictoriaMetrics components: add support for reading cpu/memory limits configured via [systemd slices](https://www.freedesktop.org/software/systemd/man/latest/systemd.slice.html). Previously, only limits set directly on the process's own cgroup were detected. See [#10635](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/10635). Thanks to @andriibeee for the contribution.
|
||||
* FEATURE: [vmui](https://docs.victoriametrics.com/victoriametrics/single-server-victoriametrics/#vmui): now `Run query` link on the Alerting Rules page correctly propagates the alert’s interval and evaluation time. See [#10366](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/10366).
|
||||
* FEATURE: [alerts](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/rules): add new `MetricNameStatsCacheUtilizationIsTooHigh` alerting rule to track overutilization of [Metric names usage stats tracker](https://docs.victoriametrics.com/victoriametrics/#track-ingested-metrics-usage) (used in [Cardinality Explorer](https://docs.victoriametrics.com/victoriametrics/#cardinality-explorer)). See [#10840](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/10840).
|
||||
* FEATURE: [stream aggregation](https://docs.victoriametrics.com/victoriametrics/stream-aggregation/): add `vm_streamaggr_counter_resets_total` metric for `total*`, `increase*` and `rate*` outputs that is useful for aggregation behaviour tracking. These metrics help to identify issues described in [Troubleshooting: counter resets](https://docs.victoriametrics.com/victoriametrics/stream-aggregation/#counter-resets).
|
||||
|
||||
@@ -3,6 +3,7 @@ package cgroup
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path"
|
||||
"runtime"
|
||||
"strconv"
|
||||
"strings"
|
||||
@@ -100,17 +101,31 @@ func getOnlineCPUCount() float64 {
|
||||
return n
|
||||
}
|
||||
|
||||
func getCPUQuotaV2(sysPrefix, cgroupPath string) (float64, error) {
|
||||
data, err := getFileContents("cpu.max", sysPrefix, cgroupPath, "")
|
||||
// See https://www.freedesktop.org/software/systemd/man/latest/systemd.slice.html
|
||||
func getCPUQuotaV2(sysfsPrefix, cgroupPath string) (float64, error) {
|
||||
subPath, err := readCgroupV2SubPath(cgroupPath)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
subPath = "/"
|
||||
}
|
||||
data = strings.TrimSpace(data)
|
||||
n, err := parseCPUMax(data)
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("cannot parse cpu.max file contents: %w", err)
|
||||
var minQuota float64 = -1
|
||||
for {
|
||||
// traverse the sub-path hierarchy up to the root and use the minimal value found for the stat
|
||||
data, err := os.ReadFile(path.Join(sysfsPrefix, subPath, "cpu.max"))
|
||||
if err == nil {
|
||||
quota, err := parseCPUMax(strings.TrimSpace(string(data)))
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("cannot parse cpu.max at %s: %w", subPath, err)
|
||||
}
|
||||
if quota > 0 && (minQuota < 0 || quota < minQuota) {
|
||||
minQuota = quota
|
||||
}
|
||||
}
|
||||
if subPath == "/" || subPath == "." {
|
||||
break
|
||||
}
|
||||
subPath = path.Dir(subPath)
|
||||
}
|
||||
return n, nil
|
||||
return minQuota, nil
|
||||
}
|
||||
|
||||
// See https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html#cpu
|
||||
|
||||
@@ -37,4 +37,7 @@ func TestGetCPUQuotaV2(t *testing.T) {
|
||||
f("testdata/cgroup", "testdata/self/cgroupv2", 2)
|
||||
f("testdata/cgroup/cpu_unset", "", -1)
|
||||
f("testdata/cgroup/cpu_onlymax", "", 2)
|
||||
|
||||
// systemd slice
|
||||
f("testdata/v2slice", "testdata/self/cgroupv2_slice", 2)
|
||||
}
|
||||
|
||||
@@ -1,9 +1,12 @@
|
||||
package cgroup
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path"
|
||||
"runtime/debug"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// GetGOGC returns GOGC value for the currently running process.
|
||||
@@ -42,15 +45,44 @@ func GetMemoryLimit() int64 {
|
||||
return n
|
||||
}
|
||||
n, err = getMemStatV2("memory.max")
|
||||
if err != nil {
|
||||
if err != nil || n <= 0 {
|
||||
return 0
|
||||
}
|
||||
return n
|
||||
}
|
||||
|
||||
func getMemStatV2(statName string) (int64, error) {
|
||||
// See https: //www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html#memory-interface-files
|
||||
return getStatGeneric(statName, "/sys/fs/cgroup", "/proc/self/cgroup", "")
|
||||
// See https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html#memory-interface-files
|
||||
return getMemLimitV2("/sys/fs/cgroup", "/proc/self/cgroup", statName)
|
||||
}
|
||||
|
||||
func getMemLimitV2(sysfsPrefix, cgroupPath, statName string) (int64, error) {
|
||||
subPath, err := readCgroupV2SubPath(cgroupPath)
|
||||
if err != nil {
|
||||
subPath = "/"
|
||||
}
|
||||
var minLimit int64 = -1
|
||||
for {
|
||||
// travers sub path hierarchy and use a minimal value for stat
|
||||
data, err := os.ReadFile(path.Join(sysfsPrefix, subPath, statName))
|
||||
if err == nil {
|
||||
s := strings.TrimSpace(string(data))
|
||||
if s != "max" {
|
||||
n, err := strconv.ParseInt(s, 10, 64)
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("cannot parse %s at %s: %w", statName, subPath, err)
|
||||
}
|
||||
if n > 0 && (minLimit < 0 || n < minLimit) {
|
||||
minLimit = n
|
||||
}
|
||||
}
|
||||
}
|
||||
if subPath == "/" || subPath == "." {
|
||||
break
|
||||
}
|
||||
subPath = path.Dir(subPath)
|
||||
}
|
||||
return minLimit, nil
|
||||
}
|
||||
|
||||
func getMemStat(statName string) (int64, error) {
|
||||
|
||||
@@ -19,6 +19,22 @@ func TestGetHierarchicalMemoryLimitSuccess(t *testing.T) {
|
||||
f("testdata/cgroup", "testdata/self/cgroup", 120)
|
||||
}
|
||||
|
||||
func TestGetMemLimitV2(t *testing.T) {
|
||||
f := func(sysPrefix, cgroupPath string, want int64) {
|
||||
t.Helper()
|
||||
got, err := getMemLimitV2(sysPrefix, cgroupPath, "memory.max")
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %s", err)
|
||||
}
|
||||
if got != want {
|
||||
t.Fatalf("unexpected result, got: %d, want %d", got, want)
|
||||
}
|
||||
}
|
||||
f("testdata/cgroup", "testdata/self/cgroupv2", 523372036854771712)
|
||||
// systemd slice
|
||||
f("testdata/v2slice", "testdata/self/cgroupv2_slice", 1073741824)
|
||||
}
|
||||
|
||||
func TestGetHierarchicalMemoryLimitFailure(t *testing.T) {
|
||||
f := func(sysPath, cgroupPath string) {
|
||||
t.Helper()
|
||||
|
||||
1
lib/cgroup/testdata/self/cgroupv2_slice
vendored
Normal file
1
lib/cgroup/testdata/self/cgroupv2_slice
vendored
Normal file
@@ -0,0 +1 @@
|
||||
0::/vm.slice/vmagent.service
|
||||
1
lib/cgroup/testdata/v2slice/cpu.max
vendored
Normal file
1
lib/cgroup/testdata/v2slice/cpu.max
vendored
Normal file
@@ -0,0 +1 @@
|
||||
max 100000
|
||||
1
lib/cgroup/testdata/v2slice/memory.max
vendored
Normal file
1
lib/cgroup/testdata/v2slice/memory.max
vendored
Normal file
@@ -0,0 +1 @@
|
||||
max
|
||||
1
lib/cgroup/testdata/v2slice/vm.slice/cpu.max
vendored
Normal file
1
lib/cgroup/testdata/v2slice/vm.slice/cpu.max
vendored
Normal file
@@ -0,0 +1 @@
|
||||
200000 100000
|
||||
1
lib/cgroup/testdata/v2slice/vm.slice/memory.max
vendored
Normal file
1
lib/cgroup/testdata/v2slice/vm.slice/memory.max
vendored
Normal file
@@ -0,0 +1 @@
|
||||
1073741824
|
||||
1
lib/cgroup/testdata/v2slice/vm.slice/vmagent.service/cpu.max
vendored
Normal file
1
lib/cgroup/testdata/v2slice/vm.slice/vmagent.service/cpu.max
vendored
Normal file
@@ -0,0 +1 @@
|
||||
max 100000
|
||||
1
lib/cgroup/testdata/v2slice/vm.slice/vmagent.service/memory.max
vendored
Normal file
1
lib/cgroup/testdata/v2slice/vm.slice/vmagent.service/memory.max
vendored
Normal file
@@ -0,0 +1 @@
|
||||
max
|
||||
@@ -43,6 +43,18 @@ func getFileContents(statName, sysfsPrefix, cgroupPath, cgroupGrepLine string) (
|
||||
return string(data), nil
|
||||
}
|
||||
|
||||
// readCgroupV2SubPath reads cgroupv2 sub-path
|
||||
// for example 0::/user.slice/user-1000.slice/session-5.scope
|
||||
// See https://www.freedesktop.org/software/systemd/man/latest/systemd.slice.html
|
||||
// and https://docs.oracle.com/en/operating-systems/oracle-linux/9/systemd/SystemdMngCgroupsV2.html#SystemdScopes
|
||||
func readCgroupV2SubPath(cgroupPath string) (string, error) {
|
||||
data, err := os.ReadFile(cgroupPath)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return grepFirstMatch(string(data), "", 2, ":")
|
||||
}
|
||||
|
||||
// grepFirstMatch searches data for a line matching match and returns the item at the given index from that line, using the given delimiter.
|
||||
func grepFirstMatch(data string, match string, index int, delimiter string) (string, error) {
|
||||
lines := strings.Split(string(data), "\n")
|
||||
|
||||
Reference in New Issue
Block a user