feat: first pass at MHC support (#77)

* feat: first pass at MHC support
k3s-io · Dec 7, 2023 · c85a596 · c85a596
1 parent 614692c
commit c85a596
Show file tree

Hide file tree

Showing 10 changed files with 763 additions and 6 deletions.
diff --git a/.golangci.yml b/.golangci.yml
@@ -70,12 +70,13 @@ linters-settings:
           - $gostd
           - github.com/go-logr/logr
           - github.com/coredns/corefile-migration/migration
+          - github.com/pkg/errors
 
           - k8s.io/api
           - k8s.io/apimachinery/pkg
           - k8s.io/apiserver
           - k8s.io/client-go
-          - k8s.io/klog/v2/klogr
+          - k8s.io/klog/v2
           - k8s.io/utils/pointer
 
           - github.com/onsi/ginkgo

diff --git a/bootstrap/config/crd/bases/controlplane.cluster.x-k8s.io_kthreescontrolplanes.yaml b/bootstrap/config/crd/bases/controlplane.cluster.x-k8s.io_kthreescontrolplanes.yaml
@@ -323,6 +323,51 @@ spec:
                   limitations. NOTE: NodeDrainTimeout is different from `kubectl drain
                   --timeout`'
                 type: string
+              remediationStrategy:
+                description: The RemediationStrategy that controls how control plane
+                  machine remediation happens.
+                properties:
+                  maxRetry:
+                    description: "MaxRetry is the Max number of retries while attempting
+                      to remediate an unhealthy machine. A retry happens when a machine
+                      that was created as a replacement for an unhealthy machine also
+                      fails. For example, given a control plane with three machines
+                      M1, M2, M3: \n M1 become unhealthy; remediation happens, and
+                      M1-1 is created as a replacement. If M1-1 (replacement of M1)
+                      has problems while bootstrapping it will become unhealthy, and
+                      then be remediated; such operation is considered a retry, remediation-retry
+                      #1. If M1-2 (replacement of M1-1) becomes unhealthy, remediation-retry
+                      #2 will happen, etc. \n A retry could happen only after RetryPeriod
+                      from the previous retry. If a machine is marked as unhealthy
+                      after MinHealthyPeriod from the previous remediation expired,
+                      this is not considered a retry anymore because the new issue
+                      is assumed unrelated from the previous one. \n If not set, the
+                      remediation will be retried infinitely."
+                    format: int32
+                    type: integer
+                  minHealthyPeriod:
+                    description: "MinHealthyPeriod defines the duration after which
+                      KCP will consider any failure to a machine unrelated from the
+                      previous one. In this case the remediation is not considered
+                      a retry anymore, and thus the retry counter restarts from 0.
+                      For example, assuming MinHealthyPeriod is set to 1h (default)
+                      \n M1 become unhealthy; remediation happens, and M1-1 is created
+                      as a replacement. If M1-1 (replacement of M1) has problems within
+                      the 1hr after the creation, also this machine will be remediated
+                      and this operation is considered a retry - a problem related
+                      to the original issue happened to M1 -. \n If instead the problem
+                      on M1-1 is happening after MinHealthyPeriod expired, e.g. four
+                      days after m1-1 has been created as a remediation of M1, the
+                      problem on M1-1 is considered unrelated to the original issue
+                      happened to M1. \n If not set, this value is defaulted to 1h."
+                    type: string
+                  retryPeriod:
+                    description: "RetryPeriod is the duration that KCP should wait
+                      before remediating a machine being created as a replacement
+                      for an unhealthy machine (a retry). \n If not set, a retry will
+                      happen immediately."
+                    type: string
+                type: object
               replicas:
                 description: Number of desired machines. Defaults to 1. When stacked
                   etcd is used only odd numbers are permitted, as per [etcd best practice](https://etcd.io/docs/v3.3.12/faq/#why-an-odd-number-of-cluster-members).
@@ -403,6 +448,30 @@ spec:
                 description: Initialized denotes whether or not the k3s server is
                   initialized.
                 type: boolean
+              lastRemediation:
+                description: LastRemediation stores info about last remediation performed.
+                properties:
+                  machine:
+                    description: Machine is the machine name of the latest machine
+                      being remediated.
+                    type: string
+                  retryCount:
+                    description: RetryCount used to keep track of remediation retry
+                      for the last remediated machine. A retry happens when a machine
+                      that was created as a replacement for an unhealthy machine also
+                      fails.
+                    format: int32
+                    type: integer
+                  timestamp:
+                    description: Timestamp is when last remediation happened. It is
+                      represented in RFC3339 form and is in UTC.
+                    format: date-time
+                    type: string
+                required:
+                - machine
+                - retryCount
+                - timestamp
+                type: object
               observedGeneration:
                 description: ObservedGeneration is the latest generation observed
                   by the controller.

diff --git a/controlplane/api/v1beta1/kthreescontrolplane_types.go b/controlplane/api/v1beta1/kthreescontrolplane_types.go
@@ -17,6 +17,8 @@ limitations under the License.
 package v1beta1
 
 import (
+	"time"
+
 	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
@@ -34,6 +36,23 @@ const (
 
 	// SkipCoreDNSAnnotation annotation explicitly skips reconciling CoreDNS if set.
 	SkipCoreDNSAnnotation = "controlplane.cluster.x-k8s.io/skip-coredns"
+
+	// RemediationInProgressAnnotation is used to keep track that a KCP remediation is in progress, and more
+	// specifically it tracks that the system is in between having deleted an unhealthy machine and recreating its replacement.
+	// NOTE: if something external to CAPI removes this annotation the system cannot detect the above situation; this can lead to
+	// failures in updating remediation retry or remediation count (both counters restart from zero).
+	RemediationInProgressAnnotation = "controlplane.cluster.x-k8s.io/remediation-in-progress"
+
+	// RemediationForAnnotation is used to link a new machine to the unhealthy machine it is replacing;
+	// please note that in case of retry, when also the remediating machine fails, the system keeps track of
+	// the first machine of the sequence only.
+	// NOTE: if something external to CAPI removes this annotation, this can lead to
+	// failures in updating remediation retry (the counter restarts from zero).
+	RemediationForAnnotation = "controlplane.cluster.x-k8s.io/remediation-for"
+
+	// DefaultMinHealthyPeriod defines the default minimum period before we consider a remediation on a
+	// machine unrelated from the previous remediation.
+	DefaultMinHealthyPeriod = 1 * time.Hour
 )
 
 // KThreesControlPlaneSpec defines the desired state of KThreesControlPlane.
@@ -74,6 +93,10 @@ type KThreesControlPlaneSpec struct {
 	// MachineTemplate contains information about how machines should be shaped
 	// when creating or updating a control plane.
 	MachineTemplate KThreesControlPlaneMachineTemplate `json:"machineTemplate,omitempty"`
+
+	// The RemediationStrategy that controls how control plane machine remediation happens.
+	// +optional
+	RemediationStrategy *RemediationStrategy `json:"remediationStrategy,omitempty"`
 }
 
 // MachineTemplate contains information about how machines should be shaped
@@ -87,6 +110,50 @@ type KThreesControlPlaneMachineTemplate struct {
 	ObjectMeta clusterv1.ObjectMeta `json:"metadata,omitempty"`
 }
 
+// RemediationStrategy allows to define how control plane machine remediation happens.
+type RemediationStrategy struct {
+	// MaxRetry is the Max number of retries while attempting to remediate an unhealthy machine.
+	// A retry happens when a machine that was created as a replacement for an unhealthy machine also fails.
+	// For example, given a control plane with three machines M1, M2, M3:
+	//
+	//	M1 become unhealthy; remediation happens, and M1-1 is created as a replacement.
+	//	If M1-1 (replacement of M1) has problems while bootstrapping it will become unhealthy, and then be
+	//	remediated; such operation is considered a retry, remediation-retry #1.
+	//	If M1-2 (replacement of M1-1) becomes unhealthy, remediation-retry #2 will happen, etc.
+	//
+	// A retry could happen only after RetryPeriod from the previous retry.
+	// If a machine is marked as unhealthy after MinHealthyPeriod from the previous remediation expired,
+	// this is not considered a retry anymore because the new issue is assumed unrelated from the previous one.
+	//
+	// If not set, the remediation will be retried infinitely.
+	// +optional
+	MaxRetry *int32 `json:"maxRetry,omitempty"`
+
+	// RetryPeriod is the duration that KCP should wait before remediating a machine being created as a replacement
+	// for an unhealthy machine (a retry).
+	//
+	// If not set, a retry will happen immediately.
+	// +optional
+	RetryPeriod metav1.Duration `json:"retryPeriod,omitempty"`
+
+	// MinHealthyPeriod defines the duration after which KCP will consider any failure to a machine unrelated
+	// from the previous one. In this case the remediation is not considered a retry anymore, and thus the retry
+	// counter restarts from 0. For example, assuming MinHealthyPeriod is set to 1h (default)
+	//
+	//	M1 become unhealthy; remediation happens, and M1-1 is created as a replacement.
+	//	If M1-1 (replacement of M1) has problems within the 1hr after the creation, also
+	//	this machine will be remediated and this operation is considered a retry - a problem related
+	//	to the original issue happened to M1 -.
+	//
+	//	If instead the problem on M1-1 is happening after MinHealthyPeriod expired, e.g. four days after
+	//	m1-1 has been created as a remediation of M1, the problem on M1-1 is considered unrelated to
+	//	the original issue happened to M1.
+	//
+	// If not set, this value is defaulted to 1h.
+	// +optional
+	MinHealthyPeriod *metav1.Duration `json:"minHealthyPeriod,omitempty"`
+}
+
 // KThreesControlPlaneStatus defines the observed state of KThreesControlPlane.
 type KThreesControlPlaneStatus struct {
 	// Selector is the label selector in string format to avoid introspection
@@ -146,6 +213,25 @@ type KThreesControlPlaneStatus struct {
 	// Conditions defines current service state of the KThreesControlPlane.
 	// +optional
 	Conditions clusterv1.Conditions `json:"conditions,omitempty"`
+
+	// LastRemediation stores info about last remediation performed.
+	// +optional
+	LastRemediation *LastRemediationStatus `json:"lastRemediation,omitempty"`
+}
+
+// LastRemediationStatus stores info about last remediation performed.
+// NOTE: if for any reason information about last remediation is lost, RetryCount is going to restart from 0 and thus
+// more remediations than expected might happen.
+type LastRemediationStatus struct {
+	// Machine is the machine name of the latest machine being remediated.
+	Machine string `json:"machine"`
+
+	// Timestamp is when last remediation happened. It is represented in RFC3339 form and is in UTC.
+	Timestamp metav1.Time `json:"timestamp"`
+
+	// RetryCount used to keep track of remediation retry for the last remediated machine.
+	// A retry happens when a machine that was created as a replacement for an unhealthy machine also fails.
+	RetryCount int32 `json:"retryCount"`
 }
 
 // +kubebuilder:object:root=true

diff --git a/controlplane/api/v1beta1/zz_generated.deepcopy.go b/controlplane/api/v1beta1/zz_generated.deepcopy.go
diff --git a/controlplane/config/crd/bases/controlplane.cluster.x-k8s.io_kthreescontrolplanes.yaml b/controlplane/config/crd/bases/controlplane.cluster.x-k8s.io_kthreescontrolplanes.yaml
@@ -323,6 +323,51 @@ spec:
                   limitations. NOTE: NodeDrainTimeout is different from `kubectl drain
                   --timeout`'
                 type: string
+              remediationStrategy:
+                description: The RemediationStrategy that controls how control plane
+                  machine remediation happens.
+                properties:
+                  maxRetry:
+                    description: "MaxRetry is the Max number of retries while attempting
+                      to remediate an unhealthy machine. A retry happens when a machine
+                      that was created as a replacement for an unhealthy machine also
+                      fails. For example, given a control plane with three machines
+                      M1, M2, M3: \n M1 become unhealthy; remediation happens, and
+                      M1-1 is created as a replacement. If M1-1 (replacement of M1)
+                      has problems while bootstrapping it will become unhealthy, and
+                      then be remediated; such operation is considered a retry, remediation-retry
+                      #1. If M1-2 (replacement of M1-1) becomes unhealthy, remediation-retry
+                      #2 will happen, etc. \n A retry could happen only after RetryPeriod
+                      from the previous retry. If a machine is marked as unhealthy
+                      after MinHealthyPeriod from the previous remediation expired,
+                      this is not considered a retry anymore because the new issue
+                      is assumed unrelated from the previous one. \n If not set, the
+                      remediation will be retried infinitely."
+                    format: int32
+                    type: integer
+                  minHealthyPeriod:
+                    description: "MinHealthyPeriod defines the duration after which
+                      KCP will consider any failure to a machine unrelated from the
+                      previous one. In this case the remediation is not considered
+                      a retry anymore, and thus the retry counter restarts from 0.
+                      For example, assuming MinHealthyPeriod is set to 1h (default)
+                      \n M1 become unhealthy; remediation happens, and M1-1 is created
+                      as a replacement. If M1-1 (replacement of M1) has problems within
+                      the 1hr after the creation, also this machine will be remediated
+                      and this operation is considered a retry - a problem related
+                      to the original issue happened to M1 -. \n If instead the problem
+                      on M1-1 is happening after MinHealthyPeriod expired, e.g. four
+                      days after m1-1 has been created as a remediation of M1, the
+                      problem on M1-1 is considered unrelated to the original issue
+                      happened to M1. \n If not set, this value is defaulted to 1h."
+                    type: string
+                  retryPeriod:
+                    description: "RetryPeriod is the duration that KCP should wait
+                      before remediating a machine being created as a replacement
+                      for an unhealthy machine (a retry). \n If not set, a retry will
+                      happen immediately."
+                    type: string
+                type: object
               replicas:
                 description: Number of desired machines. Defaults to 1. When stacked
                   etcd is used only odd numbers are permitted, as per [etcd best practice](https://etcd.io/docs/v3.3.12/faq/#why-an-odd-number-of-cluster-members).
@@ -403,6 +448,30 @@ spec:
                 description: Initialized denotes whether or not the k3s server is
                   initialized.
                 type: boolean
+              lastRemediation:
+                description: LastRemediation stores info about last remediation performed.
+                properties:
+                  machine:
+                    description: Machine is the machine name of the latest machine
+                      being remediated.
+                    type: string
+                  retryCount:
+                    description: RetryCount used to keep track of remediation retry
+                      for the last remediated machine. A retry happens when a machine
+                      that was created as a replacement for an unhealthy machine also
+                      fails.
+                    format: int32
+                    type: integer
+                  timestamp:
+                    description: Timestamp is when last remediation happened. It is
+                      represented in RFC3339 form and is in UTC.
+                    format: date-time
+                    type: string
+                required:
+                - machine
+                - retryCount
+                - timestamp
+                type: object
               observedGeneration:
                 description: ObservedGeneration is the latest generation observed
                   by the controller.

diff --git a/controlplane/controllers/kthreescontrolplane_controller.go b/controlplane/controllers/kthreescontrolplane_controller.go
@@ -472,9 +472,9 @@ func (r *KThreesControlPlaneReconciler) reconcile(ctx context.Context, cluster *
 
 	// Reconcile unhealthy machines by triggering deletion and requeue if it is considered safe to remediate,
 	// otherwise continue with the other KCP operations.
-	// if result, err := r.reconcileUnhealthyMachines(ctx, controlPlane); err != nil || !result.IsZero() {
-	//	return result, err
-	// }
+	if result, err := r.reconcileUnhealthyMachines(ctx, controlPlane); err != nil || !result.IsZero() {
+		return result, err
+	}
 
 	// Control plane machines rollout due to configuration changes (e.g. upgrades) takes precedence over other operations.
 	needRollout := controlPlane.MachinesNeedingRollout()