diff --git a/.golangci.yml b/.golangci.yml index 9b75d52d..001e7119 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -70,12 +70,13 @@ linters-settings: - $gostd - github.com/go-logr/logr - github.com/coredns/corefile-migration/migration + - github.com/pkg/errors - k8s.io/api - k8s.io/apimachinery/pkg - k8s.io/apiserver - k8s.io/client-go - - k8s.io/klog/v2/klogr + - k8s.io/klog/v2 - k8s.io/utils/pointer - github.com/onsi/ginkgo diff --git a/bootstrap/config/crd/bases/controlplane.cluster.x-k8s.io_kthreescontrolplanes.yaml b/bootstrap/config/crd/bases/controlplane.cluster.x-k8s.io_kthreescontrolplanes.yaml index 420feccb..1bb281d7 100644 --- a/bootstrap/config/crd/bases/controlplane.cluster.x-k8s.io_kthreescontrolplanes.yaml +++ b/bootstrap/config/crd/bases/controlplane.cluster.x-k8s.io_kthreescontrolplanes.yaml @@ -323,6 +323,51 @@ spec: limitations. NOTE: NodeDrainTimeout is different from `kubectl drain --timeout`' type: string + remediationStrategy: + description: The RemediationStrategy that controls how control plane + machine remediation happens. + properties: + maxRetry: + description: "MaxRetry is the Max number of retries while attempting + to remediate an unhealthy machine. A retry happens when a machine + that was created as a replacement for an unhealthy machine also + fails. For example, given a control plane with three machines + M1, M2, M3: \n M1 become unhealthy; remediation happens, and + M1-1 is created as a replacement. If M1-1 (replacement of M1) + has problems while bootstrapping it will become unhealthy, and + then be remediated; such operation is considered a retry, remediation-retry + #1. If M1-2 (replacement of M1-1) becomes unhealthy, remediation-retry + #2 will happen, etc. \n A retry could happen only after RetryPeriod + from the previous retry. If a machine is marked as unhealthy + after MinHealthyPeriod from the previous remediation expired, + this is not considered a retry anymore because the new issue + is assumed unrelated from the previous one. \n If not set, the + remedation will be retried infinitely." + format: int32 + type: integer + minHealthyPeriod: + description: "MinHealthyPeriod defines the duration after which + KCP will consider any failure to a machine unrelated from the + previous one. In this case the remediation is not considered + a retry anymore, and thus the retry counter restarts from 0. + For example, assuming MinHealthyPeriod is set to 1h (default) + \n M1 become unhealthy; remediation happens, and M1-1 is created + as a replacement. If M1-1 (replacement of M1) has problems within + the 1hr after the creation, also this machine will be remediated + and this operation is considered a retry - a problem related + to the original issue happened to M1 -. \n If instead the problem + on M1-1 is happening after MinHealthyPeriod expired, e.g. four + days after m1-1 has been created as a remediation of M1, the + problem on M1-1 is considered unrelated to the original issue + happened to M1. \n If not set, this value is defaulted to 1h." + type: string + retryPeriod: + description: "RetryPeriod is the duration that KCP should wait + before remediating a machine being created as a replacement + for an unhealthy machine (a retry). \n If not set, a retry will + happen immediately." + type: string + type: object replicas: description: Number of desired machines. Defaults to 1. When stacked etcd is used only odd numbers are permitted, as per [etcd best practice](https://etcd.io/docs/v3.3.12/faq/#why-an-odd-number-of-cluster-members). 
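For illustration only (not part of the patch or the generated CRD above), the new field could be set on a KThreesControlPlane roughly as follows; the object name and durations are hypothetical, and other required fields (e.g. version and the machine/infrastructure template) are omitted:

    apiVersion: controlplane.cluster.x-k8s.io/v1beta1
    kind: KThreesControlPlane
    metadata:
      name: example
    spec:
      replicas: 3
      remediationStrategy:
        maxRetry: 3          # give up after three failed replacement machines
        retryPeriod: 5m      # wait 5 minutes between retries
        minHealthyPeriod: 1h # a failure after 1h of health starts a new retry sequence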
@@ -403,6 +448,30 @@ spec: description: Initialized denotes whether or not the k3s server is initialized. type: boolean + lastRemediation: + description: LastRemediation stores info about last remediation performed. + properties: + machine: + description: Machine is the machine name of the latest machine + being remediated. + type: string + retryCount: + description: RetryCount used to keep track of remediation retry + for the last remediated machine. A retry happens when a machine + that was created as a replacement for an unhealthy machine also + fails. + format: int32 + type: integer + timestamp: + description: Timestamp is when last remediation happened. It is + represented in RFC3339 form and is in UTC. + format: date-time + type: string + required: + - machine + - retryCount + - timestamp + type: object observedGeneration: description: ObservedGeneration is the latest generation observed by the controller. diff --git a/controlplane/api/v1beta1/kthreescontrolplane_types.go b/controlplane/api/v1beta1/kthreescontrolplane_types.go index fc184562..95ec8537 100644 --- a/controlplane/api/v1beta1/kthreescontrolplane_types.go +++ b/controlplane/api/v1beta1/kthreescontrolplane_types.go @@ -17,6 +17,8 @@ limitations under the License. package v1beta1 import ( + "time" + corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" @@ -34,6 +36,23 @@ const ( // SkipCoreDNSAnnotation annotation explicitly skips reconciling CoreDNS if set. SkipCoreDNSAnnotation = "controlplane.cluster.x-k8s.io/skip-coredns" + + // RemediationInProgressAnnotation is used to keep track that a KCP remediation is in progress, and more + // specifically it tracks that the system is in between having deleted an unhealthy machine and recreating its replacement. + // NOTE: if something external to CAPI removes this annotation, the system cannot detect the above situation; this can lead to + // failures in updating the remediation retry or remediation count (both counters restart from zero). + RemediationInProgressAnnotation = "controlplane.cluster.x-k8s.io/remediation-in-progress" + + // RemediationForAnnotation is used to link a new machine to the unhealthy machine it is replacing; + // please note that in case of retries, when the machine created by a remediation also fails, the system keeps track of + // the first machine of the sequence only. + // NOTE: if something external to CAPI removes this annotation, this can lead to + // failures in updating the remediation retry count (the counter restarts from zero). + RemediationForAnnotation = "controlplane.cluster.x-k8s.io/remediation-for" + + // DefaultMinHealthyPeriod defines the default minimum period before we consider a remediation on a + // machine unrelated to the previous remediation. + DefaultMinHealthyPeriod = 1 * time.Hour ) // KThreesControlPlaneSpec defines the desired state of KThreesControlPlane. @@ -74,6 +93,10 @@ type KThreesControlPlaneSpec struct { // MachineTemplate contains information about how machines should be shaped // when creating or updating a control plane. MachineTemplate KThreesControlPlaneMachineTemplate `json:"machineTemplate,omitempty"` + + // The RemediationStrategy that controls how control plane machine remediation happens.
+ // +optional + RemediationStrategy *RemediationStrategy `json:"remediationStrategy,omitempty"` } // MachineTemplate contains information about how machines should be shaped @@ -87,6 +110,50 @@ type KThreesControlPlaneMachineTemplate struct { ObjectMeta clusterv1.ObjectMeta `json:"metadata,omitempty"` } +// RemediationStrategy allows defining how control plane machine remediation happens. +type RemediationStrategy struct { + // MaxRetry is the maximum number of retries while attempting to remediate an unhealthy machine. + // A retry happens when a machine that was created as a replacement for an unhealthy machine also fails. + // For example, given a control plane with three machines M1, M2, M3: + // + // M1 becomes unhealthy; remediation happens, and M1-1 is created as a replacement. + // If M1-1 (replacement of M1) has problems while bootstrapping it will become unhealthy, and then be + // remediated; such an operation is considered a retry, remediation-retry #1. + // If M1-2 (replacement of M1-1) becomes unhealthy, remediation-retry #2 will happen, etc. + // + // A retry could happen only after RetryPeriod from the previous retry. + // If a machine is marked as unhealthy after MinHealthyPeriod from the previous remediation expired, + // this is not considered a retry anymore because the new issue is assumed unrelated to the previous one. + // + // If not set, the remediation will be retried indefinitely. + // +optional + MaxRetry *int32 `json:"maxRetry,omitempty"` + + // RetryPeriod is the duration that KCP should wait before remediating a machine that was created as a replacement + // for an unhealthy machine (a retry). + // + // If not set, a retry will happen immediately. + // +optional + RetryPeriod metav1.Duration `json:"retryPeriod,omitempty"` + + // MinHealthyPeriod defines the duration after which KCP will consider any failure of a machine unrelated + // to the previous one. In this case the remediation is not considered a retry anymore, and thus the retry + // counter restarts from 0. For example, assuming MinHealthyPeriod is set to 1h (the default): + // + // M1 becomes unhealthy; remediation happens, and M1-1 is created as a replacement. + // If M1-1 (replacement of M1) has problems within 1h of its creation, this machine will also + // be remediated and this operation is considered a retry - a problem related + // to the original issue that happened to M1. + // + // If instead the problem on M1-1 happens after MinHealthyPeriod has expired, e.g. four days after + // M1-1 was created as a remediation of M1, the problem on M1-1 is considered unrelated to + // the original issue that happened to M1. + // + // If not set, this value is defaulted to 1h. + // +optional + MinHealthyPeriod *metav1.Duration `json:"minHealthyPeriod,omitempty"` +} + // KThreesControlPlaneStatus defines the observed state of KThreesControlPlane. type KThreesControlPlaneStatus struct { // Selector is the label selector in string format to avoid introspection @@ -146,6 +213,25 @@ type KThreesControlPlaneStatus struct { // Conditions defines current service state of the KThreesControlPlane. // +optional Conditions clusterv1.Conditions `json:"conditions,omitempty"` + + // LastRemediation stores info about last remediation performed. + // +optional + LastRemediation *LastRemediationStatus `json:"lastRemediation,omitempty"` +} + +// LastRemediationStatus stores info about last remediation performed.
+// NOTE: if for any reason information about last remediation are lost, RetryCount is going to restart from 0 and thus +// more remediations than expected might happen. +type LastRemediationStatus struct { + // Machine is the machine name of the latest machine being remediated. + Machine string `json:"machine"` + + // Timestamp is when last remediation happened. It is represented in RFC3339 form and is in UTC. + Timestamp metav1.Time `json:"timestamp"` + + // RetryCount used to keep track of remediation retry for the last remediated machine. + // A retry happens when a machine that was created as a replacement for an unhealthy machine also fails. + RetryCount int32 `json:"retryCount"` } // +kubebuilder:object:root=true diff --git a/controlplane/api/v1beta1/zz_generated.deepcopy.go b/controlplane/api/v1beta1/zz_generated.deepcopy.go index 9550c57e..999b0fbf 100644 --- a/controlplane/api/v1beta1/zz_generated.deepcopy.go +++ b/controlplane/api/v1beta1/zz_generated.deepcopy.go @@ -122,6 +122,11 @@ func (in *KThreesControlPlaneSpec) DeepCopyInto(out *KThreesControlPlaneSpec) { **out = **in } in.MachineTemplate.DeepCopyInto(&out.MachineTemplate) + if in.RemediationStrategy != nil { + in, out := &in.RemediationStrategy, &out.RemediationStrategy + *out = new(RemediationStrategy) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new KThreesControlPlaneSpec. @@ -149,6 +154,11 @@ func (in *KThreesControlPlaneStatus) DeepCopyInto(out *KThreesControlPlaneStatus (*in)[i].DeepCopyInto(&(*out)[i]) } } + if in.LastRemediation != nil { + in, out := &in.LastRemediation, &out.LastRemediation + *out = new(LastRemediationStatus) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new KThreesControlPlaneStatus. @@ -160,3 +170,45 @@ func (in *KThreesControlPlaneStatus) DeepCopy() *KThreesControlPlaneStatus { in.DeepCopyInto(out) return out } + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *LastRemediationStatus) DeepCopyInto(out *LastRemediationStatus) { + *out = *in + in.Timestamp.DeepCopyInto(&out.Timestamp) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new LastRemediationStatus. +func (in *LastRemediationStatus) DeepCopy() *LastRemediationStatus { + if in == nil { + return nil + } + out := new(LastRemediationStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *RemediationStrategy) DeepCopyInto(out *RemediationStrategy) { + *out = *in + if in.MaxRetry != nil { + in, out := &in.MaxRetry, &out.MaxRetry + *out = new(int32) + **out = **in + } + out.RetryPeriod = in.RetryPeriod + if in.MinHealthyPeriod != nil { + in, out := &in.MinHealthyPeriod, &out.MinHealthyPeriod + *out = new(v1.Duration) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RemediationStrategy. 
+func (in *RemediationStrategy) DeepCopy() *RemediationStrategy { + if in == nil { + return nil + } + out := new(RemediationStrategy) + in.DeepCopyInto(out) + return out +} diff --git a/controlplane/config/crd/bases/controlplane.cluster.x-k8s.io_kthreescontrolplanes.yaml b/controlplane/config/crd/bases/controlplane.cluster.x-k8s.io_kthreescontrolplanes.yaml index 420feccb..1bb281d7 100644 --- a/controlplane/config/crd/bases/controlplane.cluster.x-k8s.io_kthreescontrolplanes.yaml +++ b/controlplane/config/crd/bases/controlplane.cluster.x-k8s.io_kthreescontrolplanes.yaml @@ -323,6 +323,51 @@ spec: limitations. NOTE: NodeDrainTimeout is different from `kubectl drain --timeout`' type: string + remediationStrategy: + description: The RemediationStrategy that controls how control plane + machine remediation happens. + properties: + maxRetry: + description: "MaxRetry is the Max number of retries while attempting + to remediate an unhealthy machine. A retry happens when a machine + that was created as a replacement for an unhealthy machine also + fails. For example, given a control plane with three machines + M1, M2, M3: \n M1 become unhealthy; remediation happens, and + M1-1 is created as a replacement. If M1-1 (replacement of M1) + has problems while bootstrapping it will become unhealthy, and + then be remediated; such operation is considered a retry, remediation-retry + #1. If M1-2 (replacement of M1-1) becomes unhealthy, remediation-retry + #2 will happen, etc. \n A retry could happen only after RetryPeriod + from the previous retry. If a machine is marked as unhealthy + after MinHealthyPeriod from the previous remediation expired, + this is not considered a retry anymore because the new issue + is assumed unrelated from the previous one. \n If not set, the + remedation will be retried infinitely." + format: int32 + type: integer + minHealthyPeriod: + description: "MinHealthyPeriod defines the duration after which + KCP will consider any failure to a machine unrelated from the + previous one. In this case the remediation is not considered + a retry anymore, and thus the retry counter restarts from 0. + For example, assuming MinHealthyPeriod is set to 1h (default) + \n M1 become unhealthy; remediation happens, and M1-1 is created + as a replacement. If M1-1 (replacement of M1) has problems within + the 1hr after the creation, also this machine will be remediated + and this operation is considered a retry - a problem related + to the original issue happened to M1 -. \n If instead the problem + on M1-1 is happening after MinHealthyPeriod expired, e.g. four + days after m1-1 has been created as a remediation of M1, the + problem on M1-1 is considered unrelated to the original issue + happened to M1. \n If not set, this value is defaulted to 1h." + type: string + retryPeriod: + description: "RetryPeriod is the duration that KCP should wait + before remediating a machine being created as a replacement + for an unhealthy machine (a retry). \n If not set, a retry will + happen immediately." + type: string + type: object replicas: description: Number of desired machines. Defaults to 1. When stacked etcd is used only odd numbers are permitted, as per [etcd best practice](https://etcd.io/docs/v3.3.12/faq/#why-an-odd-number-of-cluster-members). @@ -403,6 +448,30 @@ spec: description: Initialized denotes whether or not the k3s server is initialized. type: boolean + lastRemediation: + description: LastRemediation stores info about last remediation performed. 
+ properties: + machine: + description: Machine is the machine name of the latest machine + being remediated. + type: string + retryCount: + description: RetryCount used to keep track of remediation retry + for the last remediated machine. A retry happens when a machine + that was created as a replacement for an unhealthy machine also + fails. + format: int32 + type: integer + timestamp: + description: Timestamp is when last remediation happened. It is + represented in RFC3339 form and is in UTC. + format: date-time + type: string + required: + - machine + - retryCount + - timestamp + type: object observedGeneration: description: ObservedGeneration is the latest generation observed by the controller. diff --git a/controlplane/controllers/kthreescontrolplane_controller.go b/controlplane/controllers/kthreescontrolplane_controller.go index a880a425..23724c39 100644 --- a/controlplane/controllers/kthreescontrolplane_controller.go +++ b/controlplane/controllers/kthreescontrolplane_controller.go @@ -472,9 +472,9 @@ func (r *KThreesControlPlaneReconciler) reconcile(ctx context.Context, cluster * // Reconcile unhealthy machines by triggering deletion and requeue if it is considered safe to remediate, // otherwise continue with the other KCP operations. - // if result, err := r.reconcileUnhealthyMachines(ctx, controlPlane); err != nil || !result.IsZero() { - // return result, err - // } + if result, err := r.reconcileUnhealthyMachines(ctx, controlPlane); err != nil || !result.IsZero() { + return result, err + } // Control plane machines rollout due to configuration changes (e.g. upgrades) takes precedence over other operations. needRollout := controlPlane.MachinesNeedingRollout() diff --git a/controlplane/controllers/remediation.go b/controlplane/controllers/remediation.go new file mode 100644 index 00000000..b7b33bac --- /dev/null +++ b/controlplane/controllers/remediation.go @@ -0,0 +1,441 @@ +/* +Copyright 2020 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controllers + +import ( + "context" + "encoding/json" + "fmt" + "time" + + "github.com/go-logr/logr" + "github.com/pkg/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + kerrors "k8s.io/apimachinery/pkg/util/errors" + "k8s.io/klog/v2" + clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" + "sigs.k8s.io/cluster-api/util/annotations" + "sigs.k8s.io/cluster-api/util/conditions" + "sigs.k8s.io/cluster-api/util/patch" + ctrl "sigs.k8s.io/controller-runtime" + + controlplanev1 "github.com/cluster-api-provider-k3s/cluster-api-k3s/controlplane/api/v1beta1" + k3s "github.com/cluster-api-provider-k3s/cluster-api-k3s/pkg/k3s" +) + +// reconcileUnhealthyMachines tries to remediate KThreesControlPlane unhealthy machines +// based on the process described in https://github.com/kubernetes-sigs/cluster-api/blob/main/docs/proposals/20191017-kubeadm-based-control-plane.md#remediation-using-delete-and-recreate +// taken from the kubeadm codebase and adapted for the k3s provider. 
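+// At a high level the function: clears stale MachineOwnerRemediated conditions from machines that went back to healthy, +// picks the oldest unhealthy machine, applies the retry limits and, for an initialized control plane, the preflight +// checks (more than one replica, no machines being deleted, etcd quorum preserved after the removal), deletes the +// machine, and records the remediation progress in the RemediationInProgressAnnotation so that the replacement created +// by the subsequent scale up can complete the operation.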
+func (r *KThreesControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.Context, controlPlane *k3s.ControlPlane) (ret ctrl.Result, retErr error) { + log := ctrl.LoggerFrom(ctx) + reconciliationTime := time.Now().UTC() + + // Cleanup pending remediation actions not completed for any reasons (e.g. number of current replicas is less or equal to 1) + // if the underlying machine is now back to healthy / not deleting. + errList := []error{} + healthyMachines := controlPlane.HealthyMachines() + for _, m := range healthyMachines { + if conditions.IsTrue(m, clusterv1.MachineHealthCheckSucceededCondition) && + conditions.IsFalse(m, clusterv1.MachineOwnerRemediatedCondition) && + m.DeletionTimestamp.IsZero() { + patchHelper, err := patch.NewHelper(m, r.Client) + if err != nil { + errList = append(errList, errors.Wrapf(err, "failed to get PatchHelper for machine %s", m.Name)) + continue + } + + conditions.Delete(m, clusterv1.MachineOwnerRemediatedCondition) + + if err := patchHelper.Patch(ctx, m, patch.WithOwnedConditions{Conditions: []clusterv1.ConditionType{ + clusterv1.MachineOwnerRemediatedCondition, + }}); err != nil { + errList = append(errList, errors.Wrapf(err, "failed to patch machine %s", m.Name)) + } + } + } + if len(errList) > 0 { + return ctrl.Result{}, kerrors.NewAggregate(errList) + } + + // Gets all machines that have `MachineHealthCheckSucceeded=False` (indicating a problem was detected on the machine) + // and `MachineOwnerRemediated` present, indicating that this controller is responsible for performing remediation. + unhealthyMachines := controlPlane.UnhealthyMachines() + + // If there are no unhealthy machines, return so KCP can proceed with other operations (ctrl.Result nil). + if len(unhealthyMachines) == 0 { + return ctrl.Result{}, nil + } + + // Select the machine to be remediated, which is the oldest machine marked as unhealthy. + // + // NOTE: The current solution is considered acceptable for the most frequent use case (only one unhealthy machine), + // however, in the future this could potentially be improved for the scenario where more than one unhealthy machine exists + // by considering which machine has lower impact on etcd quorum. + machineToBeRemediated := unhealthyMachines.Oldest() + + // Returns if the machine is in the process of being deleted. + if !machineToBeRemediated.ObjectMeta.DeletionTimestamp.IsZero() { + return ctrl.Result{}, nil + } + + log = log.WithValues("Machine", klog.KObj(machineToBeRemediated), "initialized", controlPlane.KCP.Status.Initialized) + + // Returns if another remediation is in progress but the new Machine is not yet created. + // Note: This condition is checked after we check for unhealthy Machines and if machineToBeRemediated + // is being deleted to avoid unnecessary logs if no further remediation should be done. + if _, ok := controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]; ok { + log.Info("Another remediation is already in progress. Skipping remediation.") + return ctrl.Result{}, nil + } + + patchHelper, err := patch.NewHelper(machineToBeRemediated, r.Client) + if err != nil { + return ctrl.Result{}, err + } + + defer func() { + // Always attempt to Patch the Machine conditions after each reconcileUnhealthyMachines. 
+ if err := patchHelper.Patch(ctx, machineToBeRemediated, patch.WithOwnedConditions{Conditions: []clusterv1.ConditionType{ + clusterv1.MachineOwnerRemediatedCondition, + }}); err != nil { + log.Error(err, "Failed to patch control plane Machine", "Machine", machineToBeRemediated.Name) + if retErr == nil { + retErr = errors.Wrapf(err, "failed to patch control plane Machine %s", machineToBeRemediated.Name) + } + } + }() + + // Before starting remediation, run preflight checks in order to verify it is safe to remediate. + // If any of the following checks fails, we'll surface the reason in the MachineOwnerRemediated condition. + + // Check if KCP is allowed to remediate considering retry limits: + // - Remediation cannot happen because retryPeriod is not yet expired. + // - KCP already reached MaxRetries limit. + remediationInProgressData, canRemediate, err := r.checkRetryLimits(log, machineToBeRemediated, controlPlane, reconciliationTime) + if err != nil { + return ctrl.Result{}, err + } + if !canRemediate { + // NOTE: log lines and conditions surfacing why it is not possible to remediate are set by checkRetryLimits. + return ctrl.Result{}, nil + } + + if controlPlane.KCP.Status.Initialized { + // Executes checks that apply only if the control plane is already initialized; in this case KCP can + // remediate only if it can safely assume that the operation preserves the operation state of the + // existing cluster (or at least it doesn't make it worse). + + // The cluster MUST have more than one replica, because this is the smallest cluster size that allows any etcd failure tolerance. + if controlPlane.Machines.Len() <= 1 { + log.Info("A control plane machine needs remediation, but the number of current replicas is less or equal to 1. Skipping remediation", "Replicas", controlPlane.Machines.Len()) + conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate if current replicas are less or equal to 1") + return ctrl.Result{}, nil + } + + // The cluster MUST have no machines with a deletion timestamp. This rule prevents KCP taking actions while the cluster is in a transitional state. + if controlPlane.HasDeletingMachine() { + log.Info("A control plane machine needs remediation, but there are other control-plane machines being deleted. Skipping remediation") + conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP waiting for control plane machine deletion to complete before triggering remediation") + return ctrl.Result{}, nil + } + + // Remediation MUST preserve etcd quorum. This rule ensures that KCP will not remove a member that would result in etcd + // losing a majority of members and thus become unable to field new requests. + if controlPlane.IsEtcdManaged() { + canSafelyRemediate := r.canSafelyRemoveEtcdMember(ctx, controlPlane, machineToBeRemediated) + if !canSafelyRemediate { + log.Info("A control plane machine needs remediation, but removing this machine could result in etcd quorum loss. 
Skipping remediation") + conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because this could result in etcd loosing quorum") + return ctrl.Result{}, nil + } + } + + // Start remediating the unhealthy control plane machine by deleting it. + // A new machine will come up completing the operation as part of the regular reconcile. + + // TODO figure out etcd complexities + // If the control plane is initialized, before deleting the machine: + // - if the machine hosts the etcd leader, forward etcd leadership to another machine. + // - delete the etcd member hosted on the machine being deleted. + // - remove the etcd member from the kubeadm config map (only for kubernetes version older than v1.22.0) + /** + workloadCluster, err := controlPlane.GetWorkloadCluster(ctx) + if err != nil { + log.Error(err, "Failed to create client to workload cluster") + return ctrl.Result{}, errors.Wrapf(err, "failed to create client to workload cluster") + } + + // If the machine that is about to be deleted is the etcd leader, move it to the newest member available. + if controlPlane.IsEtcdManaged() { + etcdLeaderCandidate := controlPlane.HealthyMachines().Newest() + if etcdLeaderCandidate == nil { + log.Info("A control plane machine needs remediation, but there is no healthy machine to forward etcd leadership to") + conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityWarning, + "A control plane machine needs remediation, but there is no healthy machine to forward etcd leadership to. Skipping remediation") + return ctrl.Result{}, nil + } + if err := workloadCluster.ForwardEtcdLeadership(ctx, machineToBeRemediated, etcdLeaderCandidate); err != nil { + log.Error(err, "Failed to move etcd leadership to candidate machine", "candidate", klog.KObj(etcdLeaderCandidate)) + conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error()) + return ctrl.Result{}, err + } + if err := workloadCluster.RemoveEtcdMemberForMachine(ctx, machineToBeRemediated); err != nil { + log.Error(err, "Failed to remove etcd member for machine") + conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error()) + return ctrl.Result{}, err + } + } + + parsedVersion, err := semver.ParseTolerant(controlPlane.KCP.Spec.Version) + if err != nil { + return ctrl.Result{}, errors.Wrapf(err, "failed to parse kubernetes version %q", controlPlane.KCP.Spec.Version) + } + + if err := workloadCluster.RemoveMachineFromKubeadmConfigMap(ctx, machineToBeRemediated, parsedVersion); err != nil { + log.Error(err, "Failed to remove machine from kubeadm ConfigMap") + return ctrl.Result{}, err + } + **/ + } + + // Delete the machine + if err := r.Client.Delete(ctx, machineToBeRemediated); err != nil { + conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error()) + return ctrl.Result{}, errors.Wrapf(err, "failed to delete unhealthy machine %s", machineToBeRemediated.Name) + } + + // Surface the operation is in progress. 
+ log.Info("Remediating unhealthy machine") + conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") + + // Prepare the info for tracking the remediation progress into the RemediationInProgressAnnotation. + remediationInProgressValue, err := remediationInProgressData.Marshal() + if err != nil { + return ctrl.Result{}, err + } + + // Set annotations tracking remediation details so they can be picked up by the machine + // that will be created as part of the scale up action that completes the remediation. + annotations.AddAnnotations(controlPlane.KCP, map[string]string{ + controlplanev1.RemediationInProgressAnnotation: remediationInProgressValue, + }) + + return ctrl.Result{Requeue: true}, nil +} + +// checkRetryLimits checks if KCP is allowed to remediate considering retry limits: +// - Remediation cannot happen because retryPeriod is not yet expired. +// - KCP already reached the maximum number of retries for a machine. +// NOTE: Counting the number of retries is required In order to prevent infinite remediation e.g. in case the +// first Control Plane machine is failing due to quota issue. +func (r *KThreesControlPlaneReconciler) checkRetryLimits(log logr.Logger, machineToBeRemediated *clusterv1.Machine, controlPlane *k3s.ControlPlane, reconciliationTime time.Time) (*RemediationData, bool, error) { + // Get last remediation info from the machine. + var lastRemediationData *RemediationData + if value, ok := machineToBeRemediated.Annotations[controlplanev1.RemediationForAnnotation]; ok { + l, err := RemediationDataFromAnnotation(value) + if err != nil { + return nil, false, err + } + lastRemediationData = l + } + + remediationInProgressData := &RemediationData{ + Machine: machineToBeRemediated.Name, + Timestamp: metav1.Time{Time: reconciliationTime}, + RetryCount: 0, + } + + // If there is no last remediation, this is the first try of a new retry sequence. + if lastRemediationData == nil { + return remediationInProgressData, true, nil + } + + // Gets MinHealthyPeriod and RetryPeriod from the remediation strategy, or use defaults. + minHealthyPeriod := controlplanev1.DefaultMinHealthyPeriod + if controlPlane.KCP.Spec.RemediationStrategy != nil && controlPlane.KCP.Spec.RemediationStrategy.MinHealthyPeriod != nil { + minHealthyPeriod = controlPlane.KCP.Spec.RemediationStrategy.MinHealthyPeriod.Duration + } + retryPeriod := time.Duration(0) + if controlPlane.KCP.Spec.RemediationStrategy != nil { + retryPeriod = controlPlane.KCP.Spec.RemediationStrategy.RetryPeriod.Duration + } + + // Gets the timestamp of the last remediation; if missing, default to a value + // that ensures both MinHealthyPeriod and RetryPeriod are expired. + // NOTE: this could potentially lead to executing more retries than expected or to executing retries before than + // expected, but this is considered acceptable when the system recovers from someone/something changes or deletes + // the RemediationForAnnotation on Machines. + lastRemediationTime := reconciliationTime.Add(-2 * max(minHealthyPeriod, retryPeriod)) + if !lastRemediationData.Timestamp.IsZero() { + lastRemediationTime = lastRemediationData.Timestamp.Time + } + + // Once we get here we already know that there was a last remediation for the Machine. + // If the current remediation is happening before minHealthyPeriod is expired, then KCP considers this + // as a remediation for the same previously unhealthy machine. 
+ // NOTE: If someone/something changes the RemediationForAnnotation on Machines (e.g. changes the Timestamp), + // this could potentially lead to executing more retries than expected, but this is considered acceptable in such a case. + var retryForSameMachineInProgress bool + if lastRemediationTime.Add(minHealthyPeriod).After(reconciliationTime) { + retryForSameMachineInProgress = true + log = log.WithValues("RemediationRetryFor", klog.KRef(machineToBeRemediated.Namespace, lastRemediationData.Machine)) + } + + // If the retry for the same machine is not in progress, this is the first try of a new retry sequence. + if !retryForSameMachineInProgress { + return remediationInProgressData, true, nil + } + + // If the remediation is for the same machine, carry over the retry count. + remediationInProgressData.RetryCount = lastRemediationData.RetryCount + + // Check if remediation can happen because retryPeriod has passed. + if lastRemediationTime.Add(retryPeriod).After(reconciliationTime) { + log.Info(fmt.Sprintf("A control plane machine needs remediation, but the operation already failed in the latest %s. Skipping remediation", retryPeriod)) + conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because the operation already failed in the latest %s (RetryPeriod)", retryPeriod) + return remediationInProgressData, false, nil + } + + // Check if remediation can happen because maxRetry has not been reached yet, if defined. + if controlPlane.KCP.Spec.RemediationStrategy != nil && controlPlane.KCP.Spec.RemediationStrategy.MaxRetry != nil { + maxRetry := int(*controlPlane.KCP.Spec.RemediationStrategy.MaxRetry) + if remediationInProgressData.RetryCount >= maxRetry { + log.Info(fmt.Sprintf("A control plane machine needs remediation, but the operation already failed %d times (MaxRetry %d). Skipping remediation", remediationInProgressData.RetryCount, maxRetry)) + conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because the operation already failed %d times (MaxRetry)", maxRetry) + return remediationInProgressData, false, nil + } + } + + // All the checks passed; increase the remediation retry count. + remediationInProgressData.RetryCount++ + + return remediationInProgressData, true, nil +} + +// max returns the maximum of two durations. +func max(x, y time.Duration) time.Duration { + if x < y { + return y + } + return x +} + +// canSafelyRemoveEtcdMember assesses whether it is possible to remove the member hosted on the machine to be remediated +// without losing etcd quorum. +// +// The answer mostly depends on the existence of other failing members on top of the one being deleted, and according +// to the etcd fault tolerance specification (see https://etcd.io/docs/v3.3/faq/#what-is-failure-tolerance): +// - 3 CP cluster does not tolerate additional failing members on top of the one being deleted (the target +// cluster size after deletion is 2, fault tolerance 0) +// - 5 CP cluster tolerates 1 additional failing member on top of the one being deleted (the target +// cluster size after deletion is 4, fault tolerance 1) +// - 7 CP cluster tolerates 2 additional failing members on top of the one being deleted (the target +// cluster size after deletion is 6, fault tolerance 2) +// - etc.
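+// In other words, with quorum = (n / 2) + 1 (integer division), removing the member is considered safe only if the +// number of healthy members left after the removal is still greater than or equal to the quorum of the resulting +// cluster; this is the check performed below on targetTotalMembers and targetUnhealthyMembers.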
+// +// NOTE: this func assumes the list of members is in sync with the list of machines/nodes; it is required to call reconcileEtcdMembers +// as well as reconcileControlPlaneConditions before this. +// +// Adapted from the kubeadm controller; it makes the assumption that the set of control plane nodes equals the set of etcd nodes. +func (r *KThreesControlPlaneReconciler) canSafelyRemoveEtcdMember(ctx context.Context, controlPlane *k3s.ControlPlane, machineToBeRemediated *clusterv1.Machine) bool { + log := ctrl.LoggerFrom(ctx) + + currentTotalMembers := len(controlPlane.Machines) + + log.Info("etcd cluster before remediation", + "currentTotalMembers", currentTotalMembers) + + // Projects the target etcd cluster after remediation, considering all the etcd members except the one being remediated. + targetTotalMembers := 0 + targetUnhealthyMembers := 0 + + healthyMembers := []string{} + unhealthyMembers := []string{} + for _, m := range controlPlane.Machines { + // Skip the machine to be deleted because it won't be part of the target etcd cluster. + if machineToBeRemediated.Status.NodeRef != nil && machineToBeRemediated.Status.NodeRef.Name == m.Status.NodeRef.Name { + continue + } + + // Include the member in the target etcd cluster. + targetTotalMembers++ + + // Check member health as reported by the machine's health conditions. + if !conditions.IsTrue(m, controlplanev1.MachineEtcdMemberHealthyCondition) { + targetUnhealthyMembers++ + unhealthyMembers = append(unhealthyMembers, fmt.Sprintf("%s (%s)", m.Status.NodeRef.Name, m.Name)) + continue + } + + healthyMembers = append(healthyMembers, fmt.Sprintf("%s (%s)", m.Status.NodeRef.Name, m.Name)) + } + + // See https://etcd.io/docs/v3.3/faq/#what-is-failure-tolerance for fault tolerance formula explanation. + targetQuorum := (targetTotalMembers / 2.0) + 1 + canSafelyRemediate := targetTotalMembers-targetUnhealthyMembers >= targetQuorum + + log.Info(fmt.Sprintf("etcd cluster projected after remediation of %s", machineToBeRemediated.Name), + "healthyMembers", healthyMembers, + "unhealthyMembers", unhealthyMembers, + "targetTotalMembers", targetTotalMembers, + "targetQuorum", targetQuorum, + "targetUnhealthyMembers", targetUnhealthyMembers, + "canSafelyRemediate", canSafelyRemediate) + + return canSafelyRemediate +} + +// RemediationData struct is used to keep track of information stored in the RemediationInProgressAnnotation in KCP +// during remediation and then in the RemediationForAnnotation on the replacement machine once it is created. +type RemediationData struct { + // Machine is the machine name of the latest machine being remediated. + Machine string `json:"machine"` + + // Timestamp is when last remediation happened. It is represented in RFC3339 form and is in UTC. + Timestamp metav1.Time `json:"timestamp"` + + // RetryCount used to keep track of remediation retry for the last remediated machine. + // A retry happens when a machine that was created as a replacement for an unhealthy machine also fails. + RetryCount int `json:"retryCount"` +} + +// RemediationDataFromAnnotation gets RemediationData from an annotation value. +func RemediationDataFromAnnotation(value string) (*RemediationData, error) { + ret := &RemediationData{} + if err := json.Unmarshal([]byte(value), ret); err != nil { + return nil, errors.Wrapf(err, "failed to unmarshal value %s for %s annotation", value, clusterv1.RemediationInProgressReason) + } + return ret, nil +} + +// Marshal a RemediationData into an annotation value.
+func (r *RemediationData) Marshal() (string, error) { + b, err := json.Marshal(r) + if err != nil { + return "", errors.Wrapf(err, "failed to marshal value for %s annotation", clusterv1.RemediationInProgressReason) + } + return string(b), nil +} + +// ToStatus converts a RemediationData into a LastRemediationStatus struct. +func (r *RemediationData) ToStatus() *controlplanev1.LastRemediationStatus { + return &controlplanev1.LastRemediationStatus{ + Machine: r.Machine, + Timestamp: r.Timestamp, + RetryCount: int32(r.RetryCount), + } +} diff --git a/go.mod b/go.mod index 8834db04..8f34690b 100644 --- a/go.mod +++ b/go.mod @@ -13,7 +13,7 @@ require ( k8s.io/client-go v0.26.1 k8s.io/klog/v2 v2.80.1 k8s.io/utils v0.0.0-20221128185143-99ec85e7a448 - sigs.k8s.io/cluster-api v1.4.3 + sigs.k8s.io/cluster-api v1.5.3 sigs.k8s.io/controller-runtime v0.14.5 sigs.k8s.io/yaml v1.3.0 ) @@ -24,6 +24,8 @@ require ( github.com/blang/semver/v4 v4.0.0 // indirect github.com/cespare/xxhash/v2 v2.1.2 // indirect github.com/coredns/caddy v1.1.0 // indirect + github.com/coreos/go-semver v0.3.0 // indirect + github.com/coreos/go-systemd/v22 v22.3.2 // indirect github.com/davecgh/go-spew v1.1.1 // indirect github.com/emicklei/go-restful/v3 v3.9.0 // indirect github.com/evanphx/json-patch v5.6.0+incompatible // indirect @@ -46,6 +48,7 @@ require ( github.com/json-iterator/go v1.1.12 // indirect github.com/mailru/easyjson v0.7.7 // indirect github.com/matttproud/golang_protobuf_extensions v1.0.2 // indirect + github.com/moby/spdystream v0.2.0 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect @@ -56,6 +59,9 @@ require ( github.com/prometheus/common v0.37.0 // indirect github.com/prometheus/procfs v0.8.0 // indirect github.com/spf13/pflag v1.0.5 // indirect + go.etcd.io/etcd/api/v3 v3.5.6 // indirect + go.etcd.io/etcd/client/pkg/v3 v3.5.6 // indirect + go.etcd.io/etcd/client/v3 v3.5.6 // indirect go.uber.org/atomic v1.9.0 // indirect go.uber.org/multierr v1.8.0 // indirect go.uber.org/zap v1.24.0 // indirect @@ -67,6 +73,8 @@ require ( golang.org/x/time v0.3.0 // indirect gomodules.xyz/jsonpatch/v2 v2.2.0 // indirect google.golang.org/appengine v1.6.7 // indirect + google.golang.org/genproto v0.0.0-20221227171554-f9683d7f8bef // indirect + google.golang.org/grpc v1.52.0 // indirect google.golang.org/protobuf v1.28.1 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 // indirect diff --git a/go.sum b/go.sum index 1f3833a0..eea44f2f 100644 --- a/go.sum +++ b/go.sum @@ -75,14 +75,19 @@ github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDk github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc= github.com/cncf/udpa/go v0.0.0-20201120205902-5459f2c99403/go.mod h1:WmhPx2Nbnhtbo57+VJT5O0JRkEi1Wbu0z5j0R8u5Hbk= github.com/cncf/xds/go v0.0.0-20210312221358-fbca930ec8ed/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs= +github.com/cncf/xds/go v0.0.0-20210805033703-aa0b78936158/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs= github.com/coredns/caddy v1.1.0 h1:ezvsPrT/tA/7pYDBZxu0cT0VmWk75AfIaf6GSYCNMf0= github.com/coredns/caddy v1.1.0/go.mod h1:A6ntJQlAWuQfFlsd9hvigKbo2WS0VUs2l1e2F+BawD4= github.com/coredns/corefile-migration v1.0.20 h1:MdOkT6F3ehju/n9tgxlGct8XAajOX2vN+wG7To4BWSI= github.com/coredns/corefile-migration 
v1.0.20/go.mod h1:XnhgULOEouimnzgn0t4WPuFDN2/PJQcTxdWKC5eXNGE= github.com/coreos/bbolt v1.3.2/go.mod h1:iRUV2dpdMOn7Bo10OQBFzIJO9kkE559Wcmn+qkEiiKk= github.com/coreos/etcd v3.3.13+incompatible/go.mod h1:uF7uidLiAD3TWHmW31ZFd/JWoc32PjwdhPthX9715RE= +github.com/coreos/go-semver v0.3.0 h1:wkHLiw0WNATZnSG7epLsujiMCgPAc9xhjJ4tgnAxmfM= github.com/coreos/go-semver v0.3.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3EedlOD2RNk= github.com/coreos/go-systemd v0.0.0-20190321100706-95778dfbb74e/go.mod h1:F5haX7vjVVG0kc13fIWeqUViNPyEJxv/OmvnBo0Yme4= +github.com/coreos/go-systemd v0.0.0-20191104093116-d3cd4ed1dbcf h1:iW4rZ826su+pqaw19uhpSCzhj44qo35pNgKFGqzDKkU= +github.com/coreos/go-systemd/v22 v22.3.2 h1:D9/bQk5vlXQFZ6Kwuu6zaiXJ9oTPe68++AzAJc1DzSI= +github.com/coreos/go-systemd/v22 v22.3.2/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= github.com/coreos/pkg v0.0.0-20180928190104-399ea9e2e55f/go.mod h1:E3G3o1h8I7cfcXa63jLwjI0eiQQMgzzUDFVpN/nH/eA= github.com/cpuguy83/go-md2man/v2 v2.0.0/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= @@ -92,13 +97,16 @@ github.com/dgrijalva/jwt-go v3.2.0+incompatible/go.mod h1:E3ru+11k8xSBh+hMPgOLZm github.com/dgryski/go-sip13 v0.0.0-20181026042036-e10d5fee7954/go.mod h1:vAd38F8PWV+bWy6jNmig1y/TA+kYO4g3RSRF0IAv0no= github.com/docker/distribution v2.8.2+incompatible h1:T3de5rq0dB1j30rp0sA2rER+m322EBzniBPB6ZIzuh8= github.com/docopt/docopt-go v0.0.0-20180111231733-ee0de3bc6815/go.mod h1:WwZ+bS3ebgob9U8Nd0kOddGdZWjyMGR8Wziv+TBNwSE= +github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk= github.com/emicklei/go-restful/v3 v3.9.0 h1:XwGDlfxEnQZzuopoqxwSEllNcCOM9DhhFyhFIIGKwxE= github.com/emicklei/go-restful/v3 v3.9.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98= github.com/envoyproxy/go-control-plane v0.9.9-0.20201210154907-fd9021fe5dad/go.mod h1:cXg6YxExXjJnVBQHBLXeUAgxn2UodCpnH306RInaBQk= +github.com/envoyproxy/go-control-plane v0.9.9-0.20210217033140-668b12f5399d/go.mod h1:cXg6YxExXjJnVBQHBLXeUAgxn2UodCpnH306RInaBQk= github.com/envoyproxy/go-control-plane v0.9.9-0.20210512163311-63b5d3c536b0/go.mod h1:hliV/p42l8fGbc6Y9bQ70uLwIvmJyVE5k4iMKlh8wCQ= +github.com/envoyproxy/go-control-plane v0.9.10-0.20210907150352-cf90f659a021/go.mod h1:AFq3mo9L8Lqqiid3OhADV3RfLJnjiw63cSpi+fDTRC0= github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= github.com/evanphx/json-patch v0.5.2/go.mod h1:ZWS5hhDbVDyob71nXKNL0+PWn6ToqBHMikGIFbs31qQ= github.com/evanphx/json-patch v5.6.0+incompatible h1:jBYDEEiFBPxA0v50tFdvOzQQTCvpL6mnFh5mB2/l16U= @@ -143,6 +151,7 @@ github.com/go-task/slim-sprig v0.0.0-20210107165309-348f09dbbbc0/go.mod h1:fyg78 github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 h1:tfuBGBXKqDEevZMzYi5KSi8KkcZtzBcTgAUUtapy0OI= github.com/gobuffalo/flect v1.0.2 h1:eqjPGSo2WmjgY2XlpGwo2NXgL3RucAKo4k4qQMNA5sA= github.com/gobuffalo/flect v1.0.2/go.mod h1:A5msMlrHtLqh9umBSnvabjsMrCcCpAyzglnDvkbYKHs= +github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= github.com/gogo/protobuf v1.1.1/go.mod 
h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ= github.com/gogo/protobuf v1.2.1/go.mod h1:hp+jE20tsWTFYpLwKvXlhS1hjn+gTNwPg2I6zVXpSg4= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= @@ -297,6 +306,8 @@ github.com/mitchellh/mapstructure v0.0.0-20160808181253-ca63d7c062ee/go.mod h1:F github.com/mitchellh/mapstructure v1.1.2/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y= github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY= github.com/mitchellh/reflectwalk v1.0.2 h1:G2LzWKi524PWgd3mLHV8Y5k7s6XUvT0Gef6zxSIeXaQ= +github.com/moby/spdystream v0.2.0 h1:cjW1zVyyoiM0T7b6UoySUFqzXMoqRckQtXwGPiBhOM8= +github.com/moby/spdystream v0.2.0/go.mod h1:f7i0iNDQJ059oMTcWxx8MA/zKFIuD/lY+0GqbN2Wy8c= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= @@ -336,6 +347,7 @@ github.com/prometheus/client_golang v0.9.3/go.mod h1:/TN21ttK/J9q6uSwhBd54HahCDf github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo= github.com/prometheus/client_golang v1.7.1/go.mod h1:PY5Wy2awLA44sXw4AOSfFBetzPP4j5+D6mVACh+pe2M= github.com/prometheus/client_golang v1.11.0/go.mod h1:Z6t4BnS23TR94PD6BsDNk8yVqroYurpAkEiz0P2BEV0= +github.com/prometheus/client_golang v1.11.1/go.mod h1:Z6t4BnS23TR94PD6BsDNk8yVqroYurpAkEiz0P2BEV0= github.com/prometheus/client_golang v1.12.1/go.mod h1:3Z9XVyYiZYEO+YQWt3RD2R3jrbd179Rt297l4aS6nDY= github.com/prometheus/client_golang v1.14.0 h1:nJdhIvne2eSX/XRAFV9PcvFFRbrjbcTUj0VP62TMhnw= github.com/prometheus/client_golang v1.14.0/go.mod h1:8vpkKitgIVNcqrRBWh1C4TIUQgYNtG/XQE4E/Zae36Y= @@ -411,7 +423,14 @@ github.com/yuin/goldmark v1.1.25/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9de github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.1.32/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k= go.etcd.io/bbolt v1.3.2/go.mod h1:IbVyRI1SCnLcuJnV2u8VeU0CEYM7e686BmAb1XKL+uU= +go.etcd.io/etcd/api/v3 v3.5.6 h1:Cy2qx3npLcYqTKqGJzMypnMv2tiRyifZJ17BlWIWA7A= +go.etcd.io/etcd/api/v3 v3.5.6/go.mod h1:KFtNaxGDw4Yx/BA4iPPwevUTAuqcsPxzyX8PHydchN8= +go.etcd.io/etcd/client/pkg/v3 v3.5.6 h1:TXQWYceBKqLp4sa87rcPs11SXxUA/mHwH975v+BDvLU= +go.etcd.io/etcd/client/pkg/v3 v3.5.6/go.mod h1:ggrwbk069qxpKPq8/FKkQ3Xq9y39kbFR4LnKszpRXeQ= +go.etcd.io/etcd/client/v3 v3.5.6 h1:coLs69PWCXE9G4FKquzNaSHrRyMCAXwF+IX1tAPVO8E= +go.etcd.io/etcd/client/v3 v3.5.6/go.mod h1:f6GRinRMCsFVv9Ht42EyY7nfsVGwrNO0WEoS2pRKzQk= go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU= go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8= go.opencensus.io v0.22.2/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= @@ -429,6 +448,7 @@ go.uber.org/multierr v1.6.0/go.mod h1:cdWPpRnG4AhwMwsgIHip0KRBQjJy5kYEpYjJxpXp9i go.uber.org/multierr v1.8.0 h1:dg6GjLku4EH+249NNmoIciG9N/jURbDG+pFlTkhzIC8= go.uber.org/multierr v1.8.0/go.mod h1:7EAYxJLBy9rStEaz58O2t4Uvip6FSURkq8/ppBp95ak= go.uber.org/zap v1.10.0/go.mod h1:vwi/ZaCAaUcBkycHslxD9B2zi4UTXhF60s6SWpuDF0Q= +go.uber.org/zap 
v1.17.0/go.mod h1:MXVU+bhUf/A7Xi2HNOnopQOrmycQ5Ih87HtOu4q5SSo= go.uber.org/zap v1.19.0/go.mod h1:xg/QME4nWcxGxrpdeYfq7UvYrLh66cuVKdrbD1XF/NI= go.uber.org/zap v1.24.0 h1:FiJd5l1UOLj0wCgbSE0rwwXHzEdAZS6hiiSnxJN/D60= go.uber.org/zap v1.24.0/go.mod h1:2kMP+WWQ8aoFoedH3T2sq6iJ2yDWpHbP0f6MQbS9Gkg= @@ -462,6 +482,7 @@ golang.org/x/lint v0.0.0-20190930215403-16217165b5de/go.mod h1:6SW0HCj/g11FgYtHl golang.org/x/lint v0.0.0-20191125180803-fdd1cda4f05f/go.mod h1:5qLYkcX4OjUUV8bRuDixDT3tpyyb+LUpUlRWLxfhWrs= golang.org/x/lint v0.0.0-20200130185559-910be7a94367/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY= golang.org/x/lint v0.0.0-20200302205851-738671d3881b/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY= +golang.org/x/lint v0.0.0-20210508222113-6edffad5e616/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY= golang.org/x/mobile v0.0.0-20190312151609-d3739f865fa6/go.mod h1:z+o9i4GpDbdi3rU15maQ/Ox0txvL9dWGYEHz965HBQE= golang.org/x/mobile v0.0.0-20190719004257-d2bd2a29d028/go.mod h1:E/iHnbuqvinMTCcRqshq8CkpyQDoeVncDDYHnLhea+o= golang.org/x/mod v0.0.0-20190513183733-4bf6d317e70e/go.mod h1:mXi4GBBbnImb6dmsKGUJ2LatrhH/nqhxcFungHvyanc= @@ -470,6 +491,7 @@ golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzB golang.org/x/mod v0.1.1-0.20191107180719-034126e5016b/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= @@ -530,6 +552,7 @@ golang.org/x/sync v0.0.0-20200317015054-43a5402ce75a/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20200625203802-6e8e738ad208/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201207232520-09787c993a3a/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sys v0.0.0-20180823144017-11551d06cbcc/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= @@ -573,6 +596,7 @@ golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20210112080510-489259a85091/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210330210617-4fbd30eecc44/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210403161142-5e06dd20ab57/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210510120138-977fb7262007/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys 
v0.0.0-20210603081109-ebe580a85c40/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= @@ -648,6 +672,7 @@ golang.org/x/tools v0.0.0-20200804011535-6c149bb5ef0d/go.mod h1:njjCfa9FT2d7l9Bc golang.org/x/tools v0.0.0-20200825202427-b303f430e36d/go.mod h1:njjCfa9FT2d7l9Bc6FUM5FLjQPp3cFF28FI3qnDFljA= golang.org/x/tools v0.0.0-20201224043029-2b0845dc783e/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= +golang.org/x/tools v0.1.2/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= golang.org/x/tools v0.7.0 h1:W4OVu8VVOaIO0yzWMNdepAulS7YfoS3Zabrm8DOXXU4= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= @@ -709,8 +734,10 @@ google.golang.org/genproto v0.0.0-20200618031413-b414f8b61790/go.mod h1:jDfRM7Fc google.golang.org/genproto v0.0.0-20200729003335-053ba62fc06f/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= google.golang.org/genproto v0.0.0-20200804131852-c06518451d9c/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= google.golang.org/genproto v0.0.0-20200825200019-8632dd797987/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= +google.golang.org/genproto v0.0.0-20210602131652-f16073e35f0c/go.mod h1:UODoCrxHCcBojKKwX1terBiRUaqAsFqJiF615XL43r0= google.golang.org/genproto v0.0.0-20220107163113-42d7afdf6368/go.mod h1:5CzLGKJ67TSI2B9POpiiyGha0AjJvZIUgRMt1dSmuhc= google.golang.org/genproto v0.0.0-20221227171554-f9683d7f8bef h1:uQ2vjV/sHTsWSqdKeLqmwitzgvjMl7o4IdtHwUDXSJY= +google.golang.org/genproto v0.0.0-20221227171554-f9683d7f8bef/go.mod h1:RGgjbofJ8xD9Sq1VVhDM1Vok1vRONV+rg+CjzG4SZKM= google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= @@ -725,7 +752,11 @@ google.golang.org/grpc v1.30.0/go.mod h1:N36X2cJ7JwdamYAgDz+s+rVMFjt3numwzf/HckM google.golang.org/grpc v1.31.0/go.mod h1:N36X2cJ7JwdamYAgDz+s+rVMFjt3numwzf/HckM8pak= google.golang.org/grpc v1.33.1/go.mod h1:fr5YgcSWrqhRRxogOsw7RzIpsmvOZ6IcH4kBYTpR3n0= google.golang.org/grpc v1.36.0/go.mod h1:qjiiYl8FncCW8feJPdyg3v6XW24KsRHe+dy9BAGRRjU= +google.golang.org/grpc v1.38.0/go.mod h1:NREThFqKR1f3iQ6oBuvc5LadQuXVGo9rkm5ZGrQdJfM= google.golang.org/grpc v1.40.0/go.mod h1:ogyxbiOoUXAkP+4+xa6PZSE9DZgIHtSpzjDTB9KAK34= +google.golang.org/grpc v1.41.0/go.mod h1:U3l9uK9J0sini8mHphKoXyaqDA/8VyGnDee1zzIUK6k= +google.golang.org/grpc v1.52.0 h1:kd48UiU7EHsV4rnLyOJRuP/Il/UHE7gdDAQ+SZI7nZk= +google.golang.org/grpc v1.52.0/go.mod h1:pu6fVzoFb+NBYNAvQL08ic+lvB2IojljRYuun5vorUY= google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM= @@ -808,5 +839,6 @@ sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2 h1:iXTIw73aPyC+oRdyqqvVJuloN sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0= sigs.k8s.io/structured-merge-diff/v4 v4.2.3 h1:PRbqxJClWWYMNV1dhaG4NsibJbArud9kFxnAMREiWFE= sigs.k8s.io/structured-merge-diff/v4 
v4.2.3/go.mod h1:qjx8mGObPmV2aSZepjQjbmb2ihdVs8cGKBraizNC69E= +sigs.k8s.io/yaml v1.2.0/go.mod h1:yfXDCHCao9+ENCvLSE62v9VSji2MKu5jeNfTrofGhJc= sigs.k8s.io/yaml v1.3.0 h1:a2VclLzOGrwOHDiV8EfBGhvjHvP46CtW5j6POvhYGGo= sigs.k8s.io/yaml v1.3.0/go.mod h1:GeOyir5tyXNByN85N/dRIT9es5UQNerPYEKK56eTBm8= diff --git a/pkg/k3s/workload_cluster.go b/pkg/k3s/workload_cluster.go index 1ffeb3c8..c0f2d636 100644 --- a/pkg/k3s/workload_cluster.go +++ b/pkg/k3s/workload_cluster.go @@ -52,7 +52,6 @@ type Workload struct { Client ctrlclient.Client CoreDNSMigrator coreDNSMigrator - // etcdClientGenerator etcdClientFor } // ClusterStatus holds stats information about the cluster.
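The new RemediationData helpers in controlplane/controllers/remediation.go round-trip through the annotations defined above. A minimal sketch of that flow, assuming the module import paths shown in the patch (a standalone illustrative program, not part of the patch; the machine name is hypothetical):

package main

import (
	"fmt"
	"time"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	controlplanev1 "github.com/cluster-api-provider-k3s/cluster-api-k3s/controlplane/api/v1beta1"
	"github.com/cluster-api-provider-k3s/cluster-api-k3s/controlplane/controllers"
)

func main() {
	// Remediation data as the controller would record it when deleting unhealthy machine "m1" (hypothetical name).
	data := &controllers.RemediationData{
		Machine:    "m1",
		Timestamp:  metav1.Time{Time: time.Now().UTC()},
		RetryCount: 0,
	}

	// Serialized into the RemediationInProgressAnnotation on the KThreesControlPlane.
	value, err := data.Marshal()
	if err != nil {
		panic(err)
	}
	kcpAnnotations := map[string]string{controlplanev1.RemediationInProgressAnnotation: value}

	// Read back later, e.g. once the value has been moved to the RemediationForAnnotation on the replacement machine.
	parsed, err := controllers.RemediationDataFromAnnotation(kcpAnnotations[controlplanev1.RemediationInProgressAnnotation])
	if err != nil {
		panic(err)
	}

	// ToStatus yields the value surfaced in KThreesControlPlane.Status.LastRemediation.
	status := parsed.ToStatus()
	fmt.Println(status.Machine, status.RetryCount, status.Timestamp.UTC().Format(time.RFC3339))
}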