diff --git a/test/e2e/data/infrastructure-docker/cluster-template-md-remediation.yaml b/test/e2e/data/infrastructure-docker/cluster-template-md-remediation.yaml index e54be632..b2bf8e7e 100644 --- a/test/e2e/data/infrastructure-docker/cluster-template-md-remediation.yaml +++ b/test/e2e/data/infrastructure-docker/cluster-template-md-remediation.yaml @@ -57,7 +57,7 @@ metadata: spec: template: spec: - customImage: k8s-snap:dev-new + customImage: k8s-snap:dev-old --- apiVersion: cluster.x-k8s.io/v1beta1 kind: MachineDeployment @@ -101,7 +101,7 @@ metadata: spec: template: spec: - customImage: k8s-snap:dev-new + customImage: k8s-snap:dev-old --- apiVersion: bootstrap.cluster.x-k8s.io/v1beta2 kind: CK8sConfigTemplate diff --git a/test/e2e/helpers.go b/test/e2e/helpers.go index 3f8ebb8c..39dcbfe3 100644 --- a/test/e2e/helpers.go +++ b/test/e2e/helpers.go @@ -661,14 +661,17 @@ func WaitForNodesReady(ctx context.Context, input WaitForNodesReadyInput) { } nodeReadyCount := 0 for _, node := range nodeList.Items { + fmt.Fprintf(GinkgoWriter, "versions: %s %s\n", semver.MajorMinor(node.Status.NodeInfo.KubeletVersion), semver.MajorMinor(input.KubernetesVersion)) if !(semver.MajorMinor(node.Status.NodeInfo.KubeletVersion) == semver.MajorMinor(input.KubernetesVersion)) { return false, nil } + fmt.Fprintf(GinkgoWriter, "node %s is ready: %t\n", node.Name, noderefutil.IsNodeReady(&node)) if !noderefutil.IsNodeReady(&node) { return false, nil } nodeReadyCount++ } + fmt.Fprintf(GinkgoWriter, "nodeReadyCount: %d, expected count: %d\n", nodeReadyCount, input.Count) return input.Count == nodeReadyCount, nil }, input.WaitForNodesReady...).Should(BeTrue()) } diff --git a/test/e2e/md_remediation_test.go b/test/e2e/md_remediation_test.go index 4f707ba2..1390f51c 100644 --- a/test/e2e/md_remediation_test.go +++ b/test/e2e/md_remediation_test.go @@ -23,6 +23,7 @@ import ( "context" "fmt" "path/filepath" + "time" . "github.com/onsi/ginkgo/v2" . 
"github.com/onsi/gomega"
@@ -31,6 +32,10 @@ import (
 	"sigs.k8s.io/cluster-api/test/framework"
 	"sigs.k8s.io/cluster-api/test/framework/clusterctl"
 	"sigs.k8s.io/cluster-api/util"
+
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/types"
+	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
 )
 
 var _ = Describe("When testing MachineDeployment remediation", func() {
@@ -120,3 +125,117 @@ var _ = Describe("When testing MachineDeployment remediation", func() {
 		})
 	})
 })
+
+// DiscoverMachineHealthChecksAndWaitForRemediation discovers the MachineHealthChecks of input.Cluster and, for
+// each of them, patches its first unhealthy condition onto the node of one matching Machine and then waits for
+// remediation. The spec fails if no MachineHealthCheck is found, or if one of them declares no unhealthy
+// condition or matches no Machine.
+func DiscoverMachineHealthChecksAndWaitForRemediation(ctx context.Context, input framework.DiscoverMachineHealthCheckAndWaitForRemediationInput) {
+	Expect(ctx).NotTo(BeNil(), "ctx is required for DiscoverMachineHealthChecksAndWaitForRemediation")
+	Expect(input.ClusterProxy).ToNot(BeNil(), "Invalid argument. input.ClusterProxy can't be nil when calling DiscoverMachineHealthChecksAndWaitForRemediation")
+	Expect(input.Cluster).ToNot(BeNil(), "Invalid argument. input.Cluster can't be nil when calling DiscoverMachineHealthChecksAndWaitForRemediation")
+
+	mgmtClient := input.ClusterProxy.GetClient()
+	fmt.Fprintln(GinkgoWriter, "Discovering machine health check resources")
+	machineHealthChecks := framework.GetMachineHealthChecksForCluster(ctx, framework.GetMachineHealthChecksForClusterInput{
+		Lister:      mgmtClient,
+		ClusterName: input.Cluster.Name,
+		Namespace:   input.Cluster.Namespace,
+	})
+
+	Expect(machineHealthChecks).NotTo(BeEmpty())
+
+	for _, mhc := range machineHealthChecks {
+		Expect(mhc.Spec.UnhealthyConditions).NotTo(BeEmpty())
+
+		fmt.Fprintln(GinkgoWriter, "Ensuring there is at least 1 Machine that MachineHealthCheck is matching")
+		machines := framework.GetMachinesByMachineHealthCheck(ctx, framework.GetMachinesByMachineHealthCheckInput{
+			Lister:             mgmtClient,
+			ClusterName:        input.Cluster.Name,
+			MachineHealthCheck: mhc,
+		})
+
+		Expect(machines).NotTo(BeEmpty())
+
+		fmt.Fprintln(GinkgoWriter, "Patching MachineHealthCheck unhealthy condition to one of the nodes")
+		unhealthyNodeCondition := corev1.NodeCondition{
+			Type:               mhc.Spec.UnhealthyConditions[0].Type,
+			Status:             mhc.Spec.UnhealthyConditions[0].Status,
+			LastTransitionTime: metav1.Time{Time: time.Now()},
+		}
+		framework.PatchNodeCondition(ctx, framework.PatchNodeConditionInput{
+			ClusterProxy:  input.ClusterProxy,
+			Cluster:       input.Cluster,
+			NodeCondition: unhealthyNodeCondition,
+			Machine:       machines[0],
+		})
+
+		fmt.Fprintln(GinkgoWriter, "Waiting for remediation")
+		// Call the helper declared in this file, not the framework one of the same name, so that the
+		// progress output it prints while polling shows up in the spec log.
+		WaitForMachineHealthCheckToRemediateUnhealthyNodeCondition(ctx, framework.WaitForMachineHealthCheckToRemediateUnhealthyNodeConditionInput{
+			ClusterProxy:       input.ClusterProxy,
+			Cluster:            input.Cluster,
+			MachineHealthCheck: mhc,
+			MachinesCount:      len(machines),
+		}, input.WaitForMachineRemediation...)
+	}
+}
+
+// WaitForMachineHealthCheckToRemediateUnhealthyNodeCondition waits until every Machine matched by
+// input.MachineHealthCheck has a node that no longer reports a matching unhealthy condition.
+func WaitForMachineHealthCheckToRemediateUnhealthyNodeCondition(ctx context.Context, input framework.WaitForMachineHealthCheckToRemediateUnhealthyNodeConditionInput, intervals ...interface{}) {
+	Expect(ctx).NotTo(BeNil(), "ctx is required for WaitForMachineHealthCheckToRemediateUnhealthyNodeCondition")
+	Expect(input.ClusterProxy).ToNot(BeNil(), "Invalid argument. input.ClusterProxy can't be nil when calling WaitForMachineHealthCheckToRemediateUnhealthyNodeCondition")
+	Expect(input.Cluster).ToNot(BeNil(), "Invalid argument. input.Cluster can't be nil when calling WaitForMachineHealthCheckToRemediateUnhealthyNodeCondition")
+	Expect(input.MachineHealthCheck).NotTo(BeNil(), "Invalid argument. input.MachineHealthCheck can't be nil when calling WaitForMachineHealthCheckToRemediateUnhealthyNodeCondition")
+	Expect(input.MachinesCount).NotTo(BeZero(), "Invalid argument. input.MachinesCount can't be zero when calling WaitForMachineHealthCheckToRemediateUnhealthyNodeCondition")
+
+	fmt.Fprintln(GinkgoWriter, "Waiting until the node with unhealthy node condition is remediated")
+	Eventually(func() bool {
+		machines := framework.GetMachinesByMachineHealthCheck(ctx, framework.GetMachinesByMachineHealthCheckInput{
+			Lister:             input.ClusterProxy.GetClient(),
+			ClusterName:        input.Cluster.Name,
+			MachineHealthCheck: input.MachineHealthCheck,
+		})
+		// Wait for all the machines to exist: this helper is called after a remediation, so make sure
+		// all the machines are back in place before testing for the unhealthy condition being fixed.
+		fmt.Fprintf(GinkgoWriter, "waiting for all machines to exist, current count: %d, expected count: %d\n", len(machines), input.MachinesCount)
+		if len(machines) < input.MachinesCount {
+			return false
+		}
+
+		for _, machine := range machines {
+			if machine.Status.NodeRef == nil {
+				fmt.Fprintf(GinkgoWriter, "machine %s has no node ref\n", machine.Name)
+				return false
+			}
+			node := &corev1.Node{}
+			// This should not be an Expect(), because it may return error during machine deletion.
+			err := input.ClusterProxy.GetWorkloadCluster(ctx, input.Cluster.Namespace, input.Cluster.Name).GetClient().Get(ctx, types.NamespacedName{Name: machine.Status.NodeRef.Name, Namespace: machine.Status.NodeRef.Namespace}, node)
+			if err != nil {
+				fmt.Fprintf(GinkgoWriter, "failed to get node from ref: %v\n", err)
+				return false
+			}
+			// Keep polling while the node still reports a condition the MachineHealthCheck treats as unhealthy.
+			if hasMatchingUnhealthyConditions(input.MachineHealthCheck, node.Status.Conditions) {
+				fmt.Fprintf(GinkgoWriter, "node of machine %s still has a matching unhealthy condition\n", machine.Name)
+				return false
+			}
+		}
+		return true
+	}, intervals...).Should(BeTrue())
+}
+
+// hasMatchingUnhealthyConditions returns true if any node condition matches, by type and status, one of the machine health check's unhealthy conditions.
+func hasMatchingUnhealthyConditions(machineHealthCheck *clusterv1.MachineHealthCheck, nodeConditions []corev1.NodeCondition) bool {
+	fmt.Fprintf(GinkgoWriter, "checking for matching unhealthy conditions, machine health check: %v, node conditions: %v\n", machineHealthCheck.Spec.UnhealthyConditions, nodeConditions)
+	for _, unhealthyCondition := range machineHealthCheck.Spec.UnhealthyConditions {
+		for _, nodeCondition := range nodeConditions {
+			if nodeCondition.Type == unhealthyCondition.Type && nodeCondition.Status == unhealthyCondition.Status {
+				return true
+			}
+		}
+	}
+	return false
+}