From 6b15ca19cd1291b8a245d72d5153827945cad037 Mon Sep 17 00:00:00 2001 From: Andrey Smirnov Date: Thu, 19 Sep 2024 21:23:16 +0400 Subject: [PATCH] fix: audit and fix cgroup reservations Fixes: #7081 Review all reservations and limits set, test under stress load (using both memory and CPU). The goal: system components (Talos itself) and runtime (kubelet, CRI) should survive under extreme resource starvation (workloads consuming all CPU/memory). Uses #9337 to visualize changes, but doesn't depend on it. Signed-off-by: Andrey Smirnov --- go.mod | 1 + go.sum | 2 + .../pkg/controllers/k8s/kubelet_spec.go | 28 ++++++- .../pkg/controllers/k8s/kubelet_spec_test.go | 55 +++++++++++-- .../v1alpha1/v1alpha1_sequencer_tasks.go | 76 ++++++++++++++++-- .../pkg/system/runner/containerd/opts.go | 12 --- .../app/machined/pkg/system/runner/runner.go | 20 +++++ .../app/machined/pkg/system/services/apid.go | 2 + .../machined/pkg/system/services/dashboard.go | 2 + .../app/machined/pkg/system/services/etcd.go | 3 + .../machined/pkg/system/services/trustd.go | 7 +- internal/pkg/cgroup/cpu.go | 52 ++++++++++++ internal/pkg/cgroup/cpu_test.go | 38 +++++++++ internal/pkg/mount/cgroups.go | 2 +- pkg/machinery/constants/constants.go | 79 ++++++++++++++++--- 15 files changed, 337 insertions(+), 42 deletions(-) create mode 100644 internal/pkg/cgroup/cpu.go create mode 100644 internal/pkg/cgroup/cpu_test.go diff --git a/go.mod b/go.mod index 01bb4c267f..13309dd20b 100644 --- a/go.mod +++ b/go.mod @@ -80,6 +80,7 @@ require ( github.com/gizak/termui/v3 v3.1.0 github.com/godbus/dbus/v5 v5.1.0 github.com/golang/mock v1.6.0 + github.com/google/cadvisor v0.50.0 github.com/google/go-containerregistry v0.20.2 github.com/google/go-tpm v0.9.1 github.com/google/nftables v0.2.0 diff --git a/go.sum b/go.sum index 219a65e494..e5fcad12dc 100644 --- a/go.sum +++ b/go.sum @@ -286,6 +286,8 @@ github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6 github.com/golang/snappy v0.0.3/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= github.com/google/btree v1.1.2 h1:xf4v41cLI2Z6FxbKm+8Bu+m8ifhj15JuZ9sa0jZCMUU= github.com/google/btree v1.1.2/go.mod h1:qOPhT0dTNdNzV6Z/lhRX0YXUafgPLFUh+gZMl761Gm4= +github.com/google/cadvisor v0.50.0 h1:7w/hKIbJKBWqQsRTy+Hpj2vj+fnxrLXcEXFy+LW0Bsg= +github.com/google/cadvisor v0.50.0/go.mod h1:VxCDwZalpFyENvmfabFqaIGsqNKLtDzE62a19rfVTB8= github.com/google/cel-go v0.21.0 h1:cl6uW/gxN+Hy50tNYvI691+sXxioCnstFzLp2WO4GCI= github.com/google/cel-go v0.21.0/go.mod h1:rHUlWCcBKgyEk+eV03RPdZUekPp6YcJwV0FxuUksYxc= github.com/google/gnostic-models v0.6.8 h1:yo/ABAfM5IMRsS1VnXjTBvUb61tFIHozhlYvRgGre9I= diff --git a/internal/app/machined/pkg/controllers/k8s/kubelet_spec.go b/internal/app/machined/pkg/controllers/k8s/kubelet_spec.go index ddb52d6b7e..d84d9760e5 100644 --- a/internal/app/machined/pkg/controllers/k8s/kubelet_spec.go +++ b/internal/app/machined/pkg/controllers/k8s/kubelet_spec.go @@ -27,8 +27,10 @@ import ( v1alpha1runtime "github.com/siderolabs/talos/internal/app/machined/pkg/runtime" "github.com/siderolabs/talos/internal/pkg/cgroup" "github.com/siderolabs/talos/pkg/argsbuilder" + "github.com/siderolabs/talos/pkg/machinery/config/machine" "github.com/siderolabs/talos/pkg/machinery/constants" "github.com/siderolabs/talos/pkg/machinery/kubelet" + "github.com/siderolabs/talos/pkg/machinery/resources/config" "github.com/siderolabs/talos/pkg/machinery/resources/k8s" ) @@ -63,6 +65,12 @@ func (ctrl *KubeletSpecController) Inputs() []controller.Input { ID: optional.Some(k8s.KubeletID), Kind: 
controller.InputWeak, }, + { + Namespace: config.NamespaceName, + Type: config.MachineTypeType, + ID: optional.Some(config.MachineTypeID), + Kind: controller.InputWeak, + }, } } @@ -100,6 +108,15 @@ func (ctrl *KubeletSpecController) Run(ctx context.Context, r controller.Runtime kubeletVersion := compatibility.VersionFromImageRef(cfgSpec.Image) + machineType, err := safe.ReaderGetByID[*config.MachineType](ctx, r, config.MachineTypeID) + if err != nil { + if state.IsNotFoundError(err) { + continue + } + + return fmt.Errorf("error getting machine type: %w", err) + } + nodename, err := safe.ReaderGetByID[*k8s.Nodename](ctx, r, k8s.NodenameID) if err != nil { if state.IsNotFoundError(err) { @@ -173,7 +190,7 @@ func (ctrl *KubeletSpecController) Run(ctx context.Context, r controller.Runtime args["image-credential-provider-config"] = constants.KubeletCredentialProviderConfig } - kubeletConfig, err := NewKubeletConfiguration(cfgSpec, kubeletVersion) + kubeletConfig, err := NewKubeletConfiguration(cfgSpec, kubeletVersion, machineType.MachineType()) if err != nil { return fmt.Errorf("error creating kubelet configuration: %w", err) } @@ -242,7 +259,7 @@ func prepareExtraConfig(extraConfig map[string]any) (*kubeletconfig.KubeletConfi // NewKubeletConfiguration builds kubelet configuration with defaults and overrides from extraConfig. // //nolint:gocyclo,cyclop -func NewKubeletConfiguration(cfgSpec *k8s.KubeletConfigSpec, kubeletVersion compatibility.Version) (*kubeletconfig.KubeletConfiguration, error) { +func NewKubeletConfiguration(cfgSpec *k8s.KubeletConfigSpec, kubeletVersion compatibility.Version, machineType machine.Type) (*kubeletconfig.KubeletConfiguration, error) { config, err := prepareExtraConfig(cfgSpec.ExtraConfig) if err != nil { return nil, err @@ -333,10 +350,15 @@ func NewKubeletConfiguration(cfgSpec *k8s.KubeletConfigSpec, kubeletVersion comp if len(config.SystemReserved) == 0 { config.SystemReserved = map[string]string{ "cpu": constants.KubeletSystemReservedCPU, - "memory": constants.KubeletSystemReservedMemory, "pid": constants.KubeletSystemReservedPid, "ephemeral-storage": constants.KubeletSystemReservedEphemeralStorage, } + + if machineType.IsControlPlane() { + config.SystemReserved["memory"] = constants.KubeletSystemReservedMemoryControlPlane + } else { + config.SystemReserved["memory"] = constants.KubeletSystemReservedMemoryWorker + } } if config.Logging.Format == "" { diff --git a/internal/app/machined/pkg/controllers/k8s/kubelet_spec_test.go b/internal/app/machined/pkg/controllers/k8s/kubelet_spec_test.go index 304dc14b10..d8b1d053d3 100644 --- a/internal/app/machined/pkg/controllers/k8s/kubelet_spec_test.go +++ b/internal/app/machined/pkg/controllers/k8s/kubelet_spec_test.go @@ -25,7 +25,9 @@ import ( "github.com/siderolabs/talos/internal/app/machined/pkg/controllers/ctest" k8sctrl "github.com/siderolabs/talos/internal/app/machined/pkg/controllers/k8s" + "github.com/siderolabs/talos/pkg/machinery/config/machine" "github.com/siderolabs/talos/pkg/machinery/constants" + "github.com/siderolabs/talos/pkg/machinery/resources/config" "github.com/siderolabs/talos/pkg/machinery/resources/k8s" ) @@ -60,6 +62,10 @@ func (suite *KubeletSpecSuite) TestReconcileDefault() { suite.Require().NoError(suite.State().Create(suite.Ctx(), nodename)) + machineType := config.NewMachineType() + machineType.SetMachineType(machine.TypeWorker) + suite.Require().NoError(suite.State().Create(suite.Ctx(), machineType)) + rtestutils.AssertResources(suite.Ctx(), suite.T(), suite.State(), 
[]resource.ID{k8s.KubeletID}, func(kubeletSpec *k8s.KubeletSpec, asrt *assert.Assertions) { spec := kubeletSpec.TypedSpec() @@ -97,6 +103,10 @@ func (suite *KubeletSpecSuite) TestReconcileWithExplicitNodeIP() { suite.Require().NoError(suite.State().Create(suite.Ctx(), nodename)) + machineType := config.NewMachineType() + machineType.SetMachineType(machine.TypeWorker) + suite.Require().NoError(suite.State().Create(suite.Ctx(), machineType)) + rtestutils.AssertResources(suite.Ctx(), suite.T(), suite.State(), []resource.ID{k8s.KubeletID}, func(kubeletSpec *k8s.KubeletSpec, asrt *assert.Assertions) { spec := kubeletSpec.TypedSpec() @@ -114,7 +124,7 @@ func (suite *KubeletSpecSuite) TestReconcileWithExplicitNodeIP() { }) } -func (suite *KubeletSpecSuite) TestReconcileWithContainerRuntimeEnpointFlag() { +func (suite *KubeletSpecSuite) TestReconcileWithContainerRuntimeEndpointFlag() { cfg := k8s.NewKubeletConfig(k8s.NamespaceName, k8s.KubeletID) cfg.TypedSpec().Image = "kubelet:v1.25.0" cfg.TypedSpec().ClusterDNS = []string{"10.96.0.10"} @@ -128,6 +138,10 @@ func (suite *KubeletSpecSuite) TestReconcileWithContainerRuntimeEnpointFlag() { suite.Require().NoError(suite.State().Create(suite.Ctx(), nodename)) + machineType := config.NewMachineType() + machineType.SetMachineType(machine.TypeWorker) + suite.Require().NoError(suite.State().Create(suite.Ctx(), machineType)) + rtestutils.AssertResources(suite.Ctx(), suite.T(), suite.State(), []resource.ID{k8s.KubeletID}, func(kubeletSpec *k8s.KubeletSpec, asrt *assert.Assertions) { spec := kubeletSpec.TypedSpec() @@ -180,6 +194,10 @@ func (suite *KubeletSpecSuite) TestReconcileWithExtraConfig() { suite.Require().NoError(suite.State().Create(suite.Ctx(), nodeIP)) + machineType := config.NewMachineType() + machineType.SetMachineType(machine.TypeWorker) + suite.Require().NoError(suite.State().Create(suite.Ctx(), machineType)) + rtestutils.AssertResources(suite.Ctx(), suite.T(), suite.State(), []resource.ID{k8s.KubeletID}, func(kubeletSpec *k8s.KubeletSpec, asrt *assert.Assertions) { spec := kubeletSpec.TypedSpec() @@ -219,6 +237,10 @@ func (suite *KubeletSpecSuite) TestReconcileWithSkipNodeRegistration() { suite.Require().NoError(suite.State().Create(suite.Ctx(), nodeIP)) + machineType := config.NewMachineType() + machineType.SetMachineType(machine.TypeWorker) + suite.Require().NoError(suite.State().Create(suite.Ctx(), machineType)) + rtestutils.AssertResources(suite.Ctx(), suite.T(), suite.State(), []resource.ID{k8s.KubeletID}, func(kubeletSpec *k8s.KubeletSpec, asrt *assert.Assertions) { spec := kubeletSpec.TypedSpec() @@ -307,7 +329,7 @@ func TestNewKubeletConfigurationFail(t *testing.T) { tt.name, func(t *testing.T) { t.Parallel() - _, err := k8sctrl.NewKubeletConfiguration(tt.cfgSpec, compatibility.VersionFromImageRef("")) + _, err := k8sctrl.NewKubeletConfiguration(tt.cfgSpec, compatibility.VersionFromImageRef(""), machine.TypeWorker) require.Error(t, err) assert.EqualError(t, err, tt.expectedErr) @@ -352,7 +374,7 @@ func TestNewKubeletConfigurationMerge(t *testing.T) { FailSwapOn: pointer.To(false), SystemReserved: map[string]string{ "cpu": constants.KubeletSystemReservedCPU, - "memory": constants.KubeletSystemReservedMemory, + "memory": constants.KubeletSystemReservedMemoryWorker, "pid": constants.KubeletSystemReservedPid, "ephemeral-storage": constants.KubeletSystemReservedEphemeralStorage, }, @@ -373,6 +395,7 @@ func TestNewKubeletConfigurationMerge(t *testing.T) { cfgSpec *k8s.KubeletConfigSpec kubeletVersion compatibility.Version expectedOverrides 
func(*kubeletconfig.KubeletConfiguration) + machineType machine.Type }{ { name: "override some", @@ -389,6 +412,19 @@ func TestNewKubeletConfigurationMerge(t *testing.T) { kc.OOMScoreAdj = pointer.To[int32](-300) kc.EnableDebuggingHandlers = pointer.To(true) }, + machineType: machine.TypeWorker, + }, + { + name: "controlplane", + cfgSpec: &k8s.KubeletConfigSpec{ + ClusterDNS: []string{"10.0.0.5"}, + ClusterDomain: "cluster.local", + }, + kubeletVersion: compatibility.VersionFromImageRef("ghcr.io/siderolabs/kubelet:v1.29.0"), + expectedOverrides: func(kc *kubeletconfig.KubeletConfiguration) { + kc.SystemReserved["memory"] = constants.KubeletSystemReservedMemoryControlPlane + }, + machineType: machine.TypeControlPlane, }, { name: "disable graceful shutdown", @@ -405,6 +441,7 @@ func TestNewKubeletConfigurationMerge(t *testing.T) { kc.ShutdownGracePeriod = metav1.Duration{} kc.ShutdownGracePeriodCriticalPods = metav1.Duration{} }, + machineType: machine.TypeWorker, }, { name: "enable seccomp default", @@ -417,6 +454,7 @@ func TestNewKubeletConfigurationMerge(t *testing.T) { expectedOverrides: func(kc *kubeletconfig.KubeletConfiguration) { kc.SeccompDefault = pointer.To(true) }, + machineType: machine.TypeWorker, }, { name: "enable skipNodeRegistration", @@ -430,6 +468,7 @@ func TestNewKubeletConfigurationMerge(t *testing.T) { kc.Authentication.Webhook.Enabled = pointer.To(false) kc.Authorization.Mode = kubeletconfig.KubeletAuthorizationModeAlwaysAllow }, + machineType: machine.TypeWorker, }, { name: "disable manifests directory", @@ -442,6 +481,7 @@ func TestNewKubeletConfigurationMerge(t *testing.T) { expectedOverrides: func(kc *kubeletconfig.KubeletConfiguration) { kc.StaticPodPath = "" }, + machineType: machine.TypeWorker, }, { name: "enable local FS quota monitoring", @@ -456,19 +496,20 @@ func TestNewKubeletConfigurationMerge(t *testing.T) { "LocalStorageCapacityIsolationFSQuotaMonitoring": true, } }, + machineType: machine.TypeWorker, }, } { t.Run(tt.name, func(t *testing.T) { t.Parallel() - expected := defaultKubeletConfig - tt.expectedOverrides(&expected) + expected := defaultKubeletConfig.DeepCopy() + tt.expectedOverrides(expected) - config, err := k8sctrl.NewKubeletConfiguration(tt.cfgSpec, tt.kubeletVersion) + config, err := k8sctrl.NewKubeletConfiguration(tt.cfgSpec, tt.kubeletVersion, tt.machineType) require.NoError(t, err) - assert.Equal(t, &expected, config) + assert.Equal(t, expected, config) }) } } diff --git a/internal/app/machined/pkg/runtime/v1alpha1/v1alpha1_sequencer_tasks.go b/internal/app/machined/pkg/runtime/v1alpha1/v1alpha1_sequencer_tasks.go index e7986a006f..87412d3b0d 100644 --- a/internal/app/machined/pkg/runtime/v1alpha1/v1alpha1_sequencer_tasks.go +++ b/internal/app/machined/pkg/runtime/v1alpha1/v1alpha1_sequencer_tasks.go @@ -182,6 +182,9 @@ func CreateSystemCgroups(runtime.Sequence, any) (runtime.TaskExecutionFunc, stri Min: pointer.To[int64](constants.CgroupInitReservedMemory), Low: pointer.To[int64](constants.CgroupInitReservedMemory * 2), }, + CPU: &cgroup2.CPU{ + Weight: pointer.To[uint64](cgroup.MillicoresToCPUWeight(cgroup.MilliCores(constants.CgroupInitMillicores))), + }, }, }, { @@ -191,15 +194,42 @@ func CreateSystemCgroups(runtime.Sequence, any) (runtime.TaskExecutionFunc, stri Min: pointer.To[int64](constants.CgroupSystemReservedMemory), Low: pointer.To[int64](constants.CgroupSystemReservedMemory * 2), }, + CPU: &cgroup2.CPU{ + Weight: pointer.To[uint64](cgroup.MillicoresToCPUWeight(cgroup.MilliCores(constants.CgroupSystemMillicores))), + }, }, }, 
{ - name: constants.CgroupSystemRuntime, - resources: &cgroup2.Resources{}, + name: constants.CgroupSystemRuntime, + resources: &cgroup2.Resources{ + Memory: &cgroup2.Memory{ + Min: pointer.To[int64](constants.CgroupSystemRuntimeReservedMemory), + Low: pointer.To[int64](constants.CgroupSystemRuntimeReservedMemory * 2), + }, + CPU: &cgroup2.CPU{ + Weight: pointer.To[uint64](cgroup.MillicoresToCPUWeight(cgroup.MilliCores(constants.CgroupSystemRuntimeMillicores))), + }, + }, }, { - name: constants.CgroupUdevd, - resources: &cgroup2.Resources{}, + name: constants.CgroupUdevd, + resources: &cgroup2.Resources{ + Memory: &cgroup2.Memory{ + Min: pointer.To[int64](constants.CgroupUdevdReservedMemory), + Low: pointer.To[int64](constants.CgroupUdevdReservedMemory * 2), + }, + CPU: &cgroup2.CPU{ + Weight: pointer.To[uint64](cgroup.MillicoresToCPUWeight(cgroup.MilliCores(constants.CgroupUdevdMillicores))), + }, + }, + }, + { + name: constants.CgroupPodRuntimeRoot, + resources: &cgroup2.Resources{ + CPU: &cgroup2.CPU{ + Weight: pointer.To[uint64](cgroup.MillicoresToCPUWeight(cgroup.MilliCores(constants.CgroupPodRuntimeRootMillicores))), + }, + }, }, { name: constants.CgroupPodRuntime, @@ -208,6 +238,9 @@ func CreateSystemCgroups(runtime.Sequence, any) (runtime.TaskExecutionFunc, stri Min: pointer.To[int64](constants.CgroupPodRuntimeReservedMemory), Low: pointer.To[int64](constants.CgroupPodRuntimeReservedMemory * 2), }, + CPU: &cgroup2.CPU{ + Weight: pointer.To[uint64](cgroup.MillicoresToCPUWeight(cgroup.MilliCores(constants.CgroupPodRuntimeMillicores))), + }, }, }, { @@ -217,14 +250,45 @@ func CreateSystemCgroups(runtime.Sequence, any) (runtime.TaskExecutionFunc, stri Min: pointer.To[int64](constants.CgroupKubeletReservedMemory), Low: pointer.To[int64](constants.CgroupKubeletReservedMemory * 2), }, + CPU: &cgroup2.CPU{ + Weight: pointer.To[uint64](cgroup.MillicoresToCPUWeight(cgroup.MilliCores(constants.CgroupKubeletMillicores))), + }, }, }, { name: constants.CgroupDashboard, resources: &cgroup2.Resources{ Memory: &cgroup2.Memory{ - Min: pointer.To[int64](constants.CgroupDashboardReservedMemory), - Low: pointer.To[int64](constants.CgroupDashboardLowMemory), + Max: pointer.To[int64](constants.CgroupDashboardMaxMemory), + }, + CPU: &cgroup2.CPU{ + Weight: pointer.To[uint64](cgroup.MillicoresToCPUWeight(cgroup.MilliCores(constants.CgroupDashboardMillicores))), + }, + }, + }, + { + name: constants.CgroupApid, + resources: &cgroup2.Resources{ + Memory: &cgroup2.Memory{ + Min: pointer.To[int64](constants.CgroupApidReservedMemory), + Low: pointer.To[int64](constants.CgroupApidReservedMemory * 2), + Max: pointer.To[int64](constants.CgroupApidMaxMemory), + }, + CPU: &cgroup2.CPU{ + Weight: pointer.To[uint64](cgroup.MillicoresToCPUWeight(cgroup.MilliCores(constants.CgroupApidMillicores))), + }, + }, + }, + { + name: constants.CgroupTrustd, + resources: &cgroup2.Resources{ + Memory: &cgroup2.Memory{ + Min: pointer.To[int64](constants.CgroupTrustdReservedMemory), + Low: pointer.To[int64](constants.CgroupTrustdReservedMemory * 2), + Max: pointer.To[int64](constants.CgroupTrustdMaxMemory), + }, + CPU: &cgroup2.CPU{ + Weight: pointer.To[uint64](cgroup.MillicoresToCPUWeight(cgroup.MilliCores(constants.CgroupTrustdMillicores))), }, }, }, diff --git a/internal/app/machined/pkg/system/runner/containerd/opts.go b/internal/app/machined/pkg/system/runner/containerd/opts.go index b0f36efe48..ef466f1623 100644 --- a/internal/app/machined/pkg/system/runner/containerd/opts.go +++ 
b/internal/app/machined/pkg/system/runner/containerd/opts.go @@ -13,18 +13,6 @@ import ( specs "github.com/opencontainers/runtime-spec/specs-go" ) -// WithMemoryLimit sets the linux resource memory limit field. -func WithMemoryLimit(limit int64) oci.SpecOpts { - return func(_ context.Context, _ oci.Client, _ *containers.Container, s *specs.Spec) error { - s.Linux.Resources.Memory = &specs.LinuxMemory{ - Limit: &limit, - // DisableOOMKiller: &disable, - } - - return nil - } -} - // WithRootfsPropagation sets the root filesystem propagation. func WithRootfsPropagation(rp string) oci.SpecOpts { return func(_ context.Context, _ oci.Client, _ *containers.Container, s *specs.Spec) error { diff --git a/internal/app/machined/pkg/system/runner/runner.go b/internal/app/machined/pkg/system/runner/runner.go index a9e6735c42..96eed7e3ba 100644 --- a/internal/app/machined/pkg/system/runner/runner.go +++ b/internal/app/machined/pkg/system/runner/runner.go @@ -6,15 +6,18 @@ package runner import ( + "context" "fmt" "io" "time" containerd "github.com/containerd/containerd/v2/client" + ocicontainers "github.com/containerd/containerd/v2/core/containers" "github.com/containerd/containerd/v2/pkg/oci" "github.com/opencontainers/runtime-spec/specs-go" "github.com/siderolabs/gen/maps" "github.com/siderolabs/gen/optional" + "github.com/siderolabs/go-pointer" "github.com/siderolabs/talos/internal/app/machined/pkg/runtime" "github.com/siderolabs/talos/internal/app/machined/pkg/runtime/logging" @@ -220,3 +223,20 @@ func WithUID(uid uint32) Option { args.UID = uid } } + +// WithMemoryReservation sets the memory reservation limit as on OCI spec. +func WithMemoryReservation(limit uint64) oci.SpecOpts { + return func(_ context.Context, _ oci.Client, _ *ocicontainers.Container, s *oci.Spec) error { + if s.Linux.Resources == nil { + s.Linux.Resources = &specs.LinuxResources{} + } + + if s.Linux.Resources.Memory == nil { + s.Linux.Resources.Memory = &specs.LinuxMemory{} + } + + s.Linux.Resources.Memory.Reservation = pointer.To(int64(limit)) + + return nil + } +} diff --git a/internal/app/machined/pkg/system/services/apid.go b/internal/app/machined/pkg/system/services/apid.go index 1f8e55547d..da7f72bf39 100644 --- a/internal/app/machined/pkg/system/services/apid.go +++ b/internal/app/machined/pkg/system/services/apid.go @@ -12,6 +12,7 @@ import ( "net" "os" "path/filepath" + "strconv" "strings" "github.com/containerd/containerd/v2/pkg/cap" @@ -164,6 +165,7 @@ func (o *APID) Runner(r runtime.Runtime) (runner.Runner, error) { env := []string{ constants.TcellMinimizeEnvironment, + "GOMEMLIMIT=" + strconv.Itoa(constants.CgroupApidMaxMemory/5*4), } for _, value := range environment.Get(r.Config()) { diff --git a/internal/app/machined/pkg/system/services/dashboard.go b/internal/app/machined/pkg/system/services/dashboard.go index 05aa6ddabb..0f5100a6d8 100644 --- a/internal/app/machined/pkg/system/services/dashboard.go +++ b/internal/app/machined/pkg/system/services/dashboard.go @@ -8,6 +8,7 @@ package services import ( "context" "fmt" + "strconv" "github.com/siderolabs/talos/internal/app/machined/pkg/runtime" "github.com/siderolabs/talos/internal/app/machined/pkg/system/events" @@ -61,6 +62,7 @@ func (d *Dashboard) Runner(r runtime.Runtime) (runner.Runner, error) { runner.WithEnv([]string{ "TERM=linux", constants.TcellMinimizeEnvironment, + "GOMEMLIMIT=" + strconv.Itoa(constants.CgroupDashboardMaxMemory/5*4), }), runner.WithStdinFile(tty), runner.WithStdoutFile(tty), diff --git a/internal/app/machined/pkg/system/services/etcd.go 
b/internal/app/machined/pkg/system/services/etcd.go index 41a8b79638..2fc8c9a486 100644 --- a/internal/app/machined/pkg/system/services/etcd.go +++ b/internal/app/machined/pkg/system/services/etcd.go @@ -36,6 +36,7 @@ import ( "github.com/siderolabs/talos/internal/app/machined/pkg/system/runner" "github.com/siderolabs/talos/internal/app/machined/pkg/system/runner/containerd" "github.com/siderolabs/talos/internal/app/machined/pkg/system/runner/restart" + "github.com/siderolabs/talos/internal/pkg/cgroup" "github.com/siderolabs/talos/internal/pkg/containers/image" "github.com/siderolabs/talos/internal/pkg/environment" "github.com/siderolabs/talos/internal/pkg/etcd" @@ -224,6 +225,8 @@ func (e *Etcd) Runner(r runtime.Runtime) (runner.Runner, error) { oci.WithHostNamespace(specs.NetworkNamespace), oci.WithMounts(mounts), oci.WithUser(fmt.Sprintf("%d:%d", constants.EtcdUserID, constants.EtcdUserID)), + runner.WithMemoryReservation(constants.CgroupEtcdReservedMemory), + oci.WithCPUShares(uint64(cgroup.MilliCoresToShares(constants.CgroupEtcdMillicores))), ), runner.WithOOMScoreAdj(-998), ), diff --git a/internal/app/machined/pkg/system/services/trustd.go b/internal/app/machined/pkg/system/services/trustd.go index 7b7a47676a..36e4ab5b09 100644 --- a/internal/app/machined/pkg/system/services/trustd.go +++ b/internal/app/machined/pkg/system/services/trustd.go @@ -12,6 +12,7 @@ import ( "net" "os" "path/filepath" + "strconv" "github.com/containerd/containerd/v2/pkg/cap" "github.com/containerd/containerd/v2/pkg/oci" @@ -142,7 +143,10 @@ func (t *Trustd) Runner(r runtime.Runtime) (runner.Runner, error) { } env := environment.Get(r.Config()) - env = append(env, constants.TcellMinimizeEnvironment) + env = append(env, + constants.TcellMinimizeEnvironment, + "GOMEMLIMIT="+strconv.Itoa(constants.CgroupTrustdMaxMemory/5*4), + ) if debug.RaceEnabled { env = append(env, "GORACE=halt_on_error=1") @@ -156,7 +160,6 @@ func (t *Trustd) Runner(r runtime.Runtime) (runner.Runner, error) { runner.WithEnv(env), runner.WithCgroupPath(constants.CgroupTrustd), runner.WithOCISpecOpts( - containerd.WithMemoryLimit(int64(1000000*512)), oci.WithDroppedCapabilities(cap.Known()), oci.WithHostNamespace(specs.NetworkNamespace), oci.WithMounts(mounts), diff --git a/internal/pkg/cgroup/cpu.go b/internal/pkg/cgroup/cpu.go new file mode 100644 index 0000000000..9d3401f9fc --- /dev/null +++ b/internal/pkg/cgroup/cpu.go @@ -0,0 +1,52 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at http://mozilla.org/MPL/2.0/. + +package cgroup + +import ( + "runtime" + "sync" + + "github.com/google/cadvisor/utils/sysfs" + "github.com/google/cadvisor/utils/sysinfo" +) + +var availableCPUCores = sync.OnceValue(func() int { + _, cores, err := sysinfo.GetNodesInfo(sysfs.NewRealSysFs()) + if err != nil || cores < 1 { + return runtime.NumCPU() + } + + return cores +}) + +// MilliCores represents a CPU value in milli-cores. +type MilliCores uint + +// AvailableMilliCores returns the number of available CPU cores in milli-cores. +func AvailableMilliCores() MilliCores { + return MilliCores(availableCPUCores()) * 1000 +} + +// CPUShare represents a CPU share value. +type CPUShare uint64 + +// MilliCoresToShares converts milli-cores to CPU shares. +func MilliCoresToShares(milliCores MilliCores) CPUShare { + return CPUShare(milliCores) * 1024 / 1000 +} + +// SharesToCPUWeight converts CPU shares to CPU weight. 
+func SharesToCPUWeight(shares CPUShare) uint64 { + return uint64((((shares - 2) * 9999) / 262142) + 1) +} + +// MillicoresToCPUWeight converts milli-cores to CPU weight. +// +// It limits millicores to available CPU cores. +func MillicoresToCPUWeight(requested MilliCores) uint64 { + requested = min(requested, AvailableMilliCores()) + + return SharesToCPUWeight(MilliCoresToShares(requested)) +} diff --git a/internal/pkg/cgroup/cpu_test.go b/internal/pkg/cgroup/cpu_test.go new file mode 100644 index 0000000000..72571a06a6 --- /dev/null +++ b/internal/pkg/cgroup/cpu_test.go @@ -0,0 +1,38 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at http://mozilla.org/MPL/2.0/. + +package cgroup_test + +import ( + "testing" + + "github.com/stretchr/testify/assert" + + "github.com/siderolabs/talos/internal/pkg/cgroup" +) + +func TestAvailableMillicores(t *testing.T) { + t.Logf("Available CPU milli-cores: %d", cgroup.AvailableMilliCores()) + + assert.GreaterOrEqual(t, cgroup.AvailableMilliCores(), cgroup.MilliCores(1000)) +} + +func TestMillicoresToShares(t *testing.T) { + assert.Equal(t, cgroup.CPUShare(102), cgroup.MilliCoresToShares(100)) + assert.Equal(t, cgroup.CPUShare(1024), cgroup.MilliCoresToShares(1000)) + assert.Equal(t, cgroup.CPUShare(2560), cgroup.MilliCoresToShares(2500)) +} + +func TestSharesToCPUWeight(t *testing.T) { + assert.Equal(t, uint64(4), cgroup.SharesToCPUWeight(102)) + assert.Equal(t, uint64(79), cgroup.SharesToCPUWeight(2048)) + assert.Equal(t, uint64(313), cgroup.SharesToCPUWeight(8192)) +} + +func TestMillicoresToCPUWeight(t *testing.T) { + // depends on number of CPUs available, but for < 1000 millicores it should be same result + assert.Equal(t, uint64(4), cgroup.MillicoresToCPUWeight(100)) + assert.Equal(t, uint64(20), cgroup.MillicoresToCPUWeight(500)) + assert.Equal(t, uint64(39), cgroup.MillicoresToCPUWeight(1000)) +} diff --git a/internal/pkg/mount/cgroups.go b/internal/pkg/mount/cgroups.go index 0214ff108d..9924eea174 100644 --- a/internal/pkg/mount/cgroups.go +++ b/internal/pkg/mount/cgroups.go @@ -32,7 +32,7 @@ func CGroupMountPoints() (mountpoints *Points, err error) { func cgroupMountPointsV2() (mountpoints *Points, err error) { cgroups := NewMountPoints() - cgroups.Set("cgroup2", NewMountPoint("cgroup", constants.CgroupMountPath, "cgroup2", unix.MS_NOSUID|unix.MS_NODEV|unix.MS_NOEXEC|unix.MS_RELATIME, "nsdelegate")) + cgroups.Set("cgroup2", NewMountPoint("cgroup", constants.CgroupMountPath, "cgroup2", unix.MS_NOSUID|unix.MS_NODEV|unix.MS_NOEXEC|unix.MS_RELATIME, "nsdelegate,memory_recursiveprot")) return cgroups, nil } diff --git a/pkg/machinery/constants/constants.go b/pkg/machinery/constants/constants.go index cd13fe0799..ac1fa14534 100644 --- a/pkg/machinery/constants/constants.go +++ b/pkg/machinery/constants/constants.go @@ -402,8 +402,11 @@ const ( // KubeletSystemReservedCPU cpu system reservation value for kubelet kubeconfig. KubeletSystemReservedCPU = "50m" - // KubeletSystemReservedMemory memory system reservation value for kubelet kubeconfig. - KubeletSystemReservedMemory = "192Mi" + // KubeletSystemReservedMemoryControlPlane memory system reservation value for kubelet kubeconfig (controlplane nodes). + KubeletSystemReservedMemoryControlPlane = "512Mi" + + // KubeletSystemReservedMemoryWorker memory system reservation value for kubelet kubeconfig (worker nodes). 
+ KubeletSystemReservedMemoryWorker = "384Mi" // KubeletSystemReservedPid pid system reservation value for kubelet kubeconfig. KubeletSystemReservedPid = "100" @@ -672,50 +675,104 @@ const ( // CgroupInitReservedMemory is the hard memory protection for the init process. CgroupInitReservedMemory = 96 * 1024 * 1024 + // CgroupInitMillicores is the CPU weight for the init process. + CgroupInitMillicores = 2000 + // CgroupSystem is the cgroup name for system processes. CgroupSystem = "/system" + // CgroupSystemMillicores is the CPU weight for the system cgroup. + CgroupSystemMillicores = 1500 + // CgroupSystemReservedMemory is the hard memory protection for the system processes. CgroupSystemReservedMemory = 96 * 1024 * 1024 // CgroupSystemRuntime is the cgroup name for containerd runtime processes. CgroupSystemRuntime = CgroupSystem + "/runtime" + // CgroupSystemRuntimeReservedMemory is the hard memory protection for the system containerd process. + CgroupSystemRuntimeReservedMemory = 48 * 1024 * 1024 + + // CgroupSystemRuntimeMillicores is the CPU weight for the system containerd process. + CgroupSystemRuntimeMillicores = 500 + // CgroupApid is the cgroup name for apid runtime processes. CgroupApid = CgroupSystem + "/apid" + // CgroupApidReservedMemory is the hard memory protection for the apid processes. + CgroupApidReservedMemory = 16 * 1024 * 1024 + + // CgroupApidMaxMemory is the hard memory limit for the apid process. + CgroupApidMaxMemory = 40 * 1024 * 1024 + + // CgroupApidMillicores is the CPU weight for the apid process. + CgroupApidMillicores = 500 + // CgroupTrustd is the cgroup name for trustd runtime processes. CgroupTrustd = CgroupSystem + "/trustd" + // CgroupTrustdReservedMemory is the hard memory protection for the trustd processes. + CgroupTrustdReservedMemory = 8 * 1024 * 1024 + + // CgroupTrustdMaxMemory is the hard memory limit for the trustd process. + CgroupTrustdMaxMemory = 24 * 1024 * 1024 + + // CgroupTrustdMillicores is the CPU weight for the trustd process. + CgroupTrustdMillicores = 250 + // CgroupUdevd is the cgroup name for udevd runtime processes. CgroupUdevd = CgroupSystem + "/udevd" + // CgroupUdevdReservedMemory is the hard memory protection for the udevd processes. + CgroupUdevdReservedMemory = 8 * 1024 * 1024 + + // CgroupUdevdMillicores is the CPU weight for the udevd process. + CgroupUdevdMillicores = 250 + // CgroupExtensions is the cgroup name for system extension processes. CgroupExtensions = CgroupSystem + "/extensions" // CgroupDashboard is the cgroup name for dashboard process. CgroupDashboard = CgroupSystem + "/dashboard" + // CgroupPodRuntimeRoot is the cgroup containing Kubernetes runtime components. + CgroupPodRuntimeRoot = "/podruntime" + + // CgroupPodRuntimeRootMillicores is the CPU weight for the pod runtime cgroup. + CgroupPodRuntimeRootMillicores = 4000 + // CgroupPodRuntime is the cgroup name for kubernetes containerd runtime processes. - CgroupPodRuntime = "/podruntime/runtime" + CgroupPodRuntime = CgroupPodRuntimeRoot + "/runtime" + + // CgroupPodRuntimeMillicores is the CPU weight for the pod runtime cgroup. + CgroupPodRuntimeMillicores = 1000 // CgroupPodRuntimeReservedMemory is the hard memory protection for the cri runtime processes. - CgroupPodRuntimeReservedMemory = 128 * 1024 * 1024 + CgroupPodRuntimeReservedMemory = 196 * 1024 * 1024 // CgroupEtcd is the cgroup name for etcd process. 
-	CgroupEtcd = "/podruntime/etcd"
+	CgroupEtcd = CgroupPodRuntimeRoot + "/etcd"
+
+	// CgroupEtcdReservedMemory is the soft memory protection for the etcd processes.
+	CgroupEtcdReservedMemory = 256 * 1024 * 1024
+
+	// CgroupEtcdMillicores is the CPU weight for the etcd process.
+	CgroupEtcdMillicores = 2000

 	// CgroupKubelet is the cgroup name for kubelet process.
-	CgroupKubelet = "/podruntime/kubelet"
+	CgroupKubelet = CgroupPodRuntimeRoot + "/kubelet"

 	// CgroupKubeletReservedMemory is the hard memory protection for the kubelet processes.
-	CgroupKubeletReservedMemory = 64 * 1024 * 1024
+	CgroupKubeletReservedMemory = 96 * 1024 * 1024
+
+	// CgroupKubeletMillicores is the CPU weight for the kubelet process.
+	CgroupKubeletMillicores = 1000

-	// CgroupDashboardReservedMemory is the hard memory protection for the dashboard process.
-	CgroupDashboardReservedMemory = 85 * 1024 * 1024
+	// CgroupDashboardMaxMemory is the hard memory limit for the dashboard process.
+	CgroupDashboardMaxMemory = 196 * 1024 * 1024

-	// CgroupDashboardLowMemory is the low memory value for the dashboard process.
-	CgroupDashboardLowMemory = 100 * 1024 * 1024
+	// CgroupDashboardMillicores is the CPU weight for the dashboard process.
+	CgroupDashboardMillicores = 200

 	// FlannelCNI is the string to use Talos-managed Flannel CNI (default).
 	FlannelCNI = "flannel"
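
For reference, the millicores -> shares -> cgroup v2 weight conversion introduced in internal/pkg/cgroup/cpu.go can be checked in isolation. The standalone sketch below is not part of the patch; it mirrors that arithmetic, and the printed values match the expectations in cpu_test.go above.

// Sketch only: reproduces the conversion logic from internal/pkg/cgroup/cpu.go.
package main

import "fmt"

// milliCoresToShares converts CPU millicores to cgroup v1 CPU shares (1000m == 1024 shares).
func milliCoresToShares(milliCores uint64) uint64 {
	return milliCores * 1024 / 1000
}

// sharesToCPUWeight maps cgroup v1 shares (2..262144) to cgroup v2 weight (1..10000).
func sharesToCPUWeight(shares uint64) uint64 {
	return (shares-2)*9999/262142 + 1
}

func main() {
	for _, millicores := range []uint64{100, 500, 1000, 2000} {
		shares := milliCoresToShares(millicores)
		weight := sharesToCPUWeight(shares)
		// 100m -> 102 shares -> weight 4
		// 500m -> 512 shares -> weight 20
		// 1000m -> 1024 shares -> weight 39
		// 2000m -> 2048 shares -> weight 79
		fmt.Printf("%dm -> %d shares -> weight %d\n", millicores, shares, weight)
	}
}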
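Similarly, the GOMEMLIMIT values wired into apid, trustd and the dashboard are derived from the new cgroup memory limits: limit / 5 * 4, i.e. 80% of the cgroup max, presumably so the Go runtime starts collecting before the hard limit is hit. A small sketch (not part of the patch; the constant values are copied from the constants.go changes above):

// Sketch only: shows the GOMEMLIMIT arithmetic used in the apid, trustd and dashboard runners.
package main

import "fmt"

const (
	cgroupApidMaxMemory      = 40 * 1024 * 1024  // constants.CgroupApidMaxMemory
	cgroupTrustdMaxMemory    = 24 * 1024 * 1024  // constants.CgroupTrustdMaxMemory
	cgroupDashboardMaxMemory = 196 * 1024 * 1024 // constants.CgroupDashboardMaxMemory
)

func main() {
	for name, maxMemory := range map[string]int{
		"apid":      cgroupApidMaxMemory,
		"trustd":    cgroupTrustdMaxMemory,
		"dashboard": cgroupDashboardMaxMemory,
	} {
		// Same integer arithmetic as in the services: max/5*4 == 80% of the cgroup limit.
		fmt.Printf("%s: GOMEMLIMIT=%d (cgroup max %d)\n", name, maxMemory/5*4, maxMemory)
	}
}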