Skip to content

Commit

Permalink
fix remediation
Browse files Browse the repository at this point in the history
  • Loading branch information
bschimke95 committed Sep 1, 2024
1 parent b5edec7 commit 383a574
Show file tree
Hide file tree
Showing 3 changed files with 118 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ metadata:
spec:
template:
spec:
customImage: k8s-snap:dev-new
customImage: k8s-snap:dev-old
---
apiVersion: cluster.x-k8s.io/v1beta1
kind: MachineDeployment
Expand Down Expand Up @@ -101,7 +101,7 @@ metadata:
spec:
template:
spec:
customImage: k8s-snap:dev-new
customImage: k8s-snap:dev-old
---
apiVersion: bootstrap.cluster.x-k8s.io/v1beta2
kind: CK8sConfigTemplate
Expand Down
3 changes: 3 additions & 0 deletions test/e2e/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -661,14 +661,17 @@ func WaitForNodesReady(ctx context.Context, input WaitForNodesReadyInput) {
}
nodeReadyCount := 0
for _, node := range nodeList.Items {
fmt.Fprintf(GinkgoWriter, "versions: %s %s\n", semver.MajorMinor(node.Status.NodeInfo.KubeletVersion), semver.MajorMinor(input.KubernetesVersion))
if !(semver.MajorMinor(node.Status.NodeInfo.KubeletVersion) == semver.MajorMinor(input.KubernetesVersion)) {
return false, nil
}
fmt.Fprintf(GinkgoWriter, "node %s is ready: %t\n", node.Name, noderefutil.IsNodeReady(&node))
if !noderefutil.IsNodeReady(&node) {
return false, nil
}
nodeReadyCount++
}
fmt.Fprintf(GinkgoWriter, "nodeReadyCount: %d, expected count: %d\n", nodeReadyCount, input.Count)
return input.Count == nodeReadyCount, nil
}, input.WaitForNodesReady...).Should(BeTrue())
}
Expand Down
113 changes: 113 additions & 0 deletions test/e2e/md_remediation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"context"
"fmt"
"path/filepath"
"time"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
Expand All @@ -31,6 +32,10 @@ import (
"sigs.k8s.io/cluster-api/test/framework"
"sigs.k8s.io/cluster-api/test/framework/clusterctl"
"sigs.k8s.io/cluster-api/util"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
)

var _ = Describe("When testing MachineDeployment remediation", func() {
Expand Down Expand Up @@ -120,3 +125,111 @@ var _ = Describe("When testing MachineDeployment remediation", func() {
})
})
})

// DiscoverMachineHealthChecksAndWaitForRemediation patches an unhealthy node condition onto one node
// observed by each MachineHealthCheck of the cluster and then waits for remediation.
//
// For every MachineHealthCheck returned for input.Cluster it:
//  1. asserts that the check declares at least one unhealthy condition and matches at least one Machine;
//  2. asks framework.PatchNodeCondition to apply the first declared unhealthy condition (stamped with the
//     current time) for the first matching Machine;
//  3. waits, using input.WaitForMachineRemediation as the polling intervals, for the remediation to complete.
//
// ctx, input.ClusterProxy and input.Cluster are required; violations and failures are reported through
// Gomega assertions.
func DiscoverMachineHealthChecksAndWaitForRemediation(ctx context.Context, input framework.DiscoverMachineHealthCheckAndWaitForRemediationInput) {
	Expect(ctx).NotTo(BeNil(), "ctx is required for DiscoverMachineHealthChecksAndWaitForRemediation")
	Expect(input.ClusterProxy).ToNot(BeNil(), "Invalid argument. input.ClusterProxy can't be nil when calling DiscoverMachineHealthChecksAndWaitForRemediation")
	Expect(input.Cluster).ToNot(BeNil(), "Invalid argument. input.Cluster can't be nil when calling DiscoverMachineHealthChecksAndWaitForRemediation")

	mgmtClient := input.ClusterProxy.GetClient()
	fmt.Fprintln(GinkgoWriter, "Discovering machine health check resources")
	machineHealthChecks := framework.GetMachineHealthChecksForCluster(ctx, framework.GetMachineHealthChecksForClusterInput{
		Lister:      mgmtClient,
		ClusterName: input.Cluster.Name,
		Namespace:   input.Cluster.Namespace,
	})

	Expect(machineHealthChecks).NotTo(BeEmpty())

	for _, mhc := range machineHealthChecks {
		Expect(mhc.Spec.UnhealthyConditions).NotTo(BeEmpty())

		fmt.Fprintln(GinkgoWriter, "Ensuring there is at least 1 Machine that MachineHealthCheck is matching")
		machines := framework.GetMachinesByMachineHealthCheck(ctx, framework.GetMachinesByMachineHealthCheckInput{
			Lister:             mgmtClient,
			ClusterName:        input.Cluster.Name,
			MachineHealthCheck: mhc,
		})

		Expect(machines).NotTo(BeEmpty())

		fmt.Fprintln(GinkgoWriter, "Patching MachineHealthCheck unhealthy condition to one of the nodes")
		// Only the first declared unhealthy condition is injected; one matching condition is
		// what hasMatchingUnhealthyConditions in this file treats as unhealthy.
		unhealthyNodeCondition := corev1.NodeCondition{
			Type:               mhc.Spec.UnhealthyConditions[0].Type,
			Status:             mhc.Spec.UnhealthyConditions[0].Status,
			LastTransitionTime: metav1.Time{Time: time.Now()},
		}
		framework.PatchNodeCondition(ctx, framework.PatchNodeConditionInput{
			ClusterProxy:  input.ClusterProxy,
			Cluster:       input.Cluster,
			NodeCondition: unhealthyNodeCondition,
			Machine:       machines[0],
		})

		fmt.Fprintln(GinkgoWriter, "Waiting for remediation")
		// Use the helper defined in this file rather than the framework one so that its
		// progress logging shows up in the test output.
		WaitForMachineHealthCheckToRemediateUnhealthyNodeCondition(ctx, framework.WaitForMachineHealthCheckToRemediateUnhealthyNodeConditionInput{
			ClusterProxy:       input.ClusterProxy,
			Cluster:            input.Cluster,
			MachineHealthCheck: mhc,
			MachinesCount:      len(machines),
		}, input.WaitForMachineRemediation...)
	}
}

// WaitForMachineHealthCheckToRemediateUnhealthyNodeCondition waits until the unhealthy node condition
// has been remediated for all machines matched by the MachineHealthCheck.
//
// It polls (using the given Gomega intervals) until all of the following hold in a single pass:
//   - at least input.MachinesCount machines are matched by input.MachineHealthCheck;
//   - every matched machine has a node ref;
//   - the referenced node can be fetched from the workload cluster;
//   - none of the node's conditions match the MachineHealthCheck's unhealthy conditions.
//
// Progress is written to GinkgoWriter on every poll. ctx, input.ClusterProxy, input.Cluster and
// input.MachineHealthCheck are required and input.MachinesCount must be non-zero; violations and a
// timeout are reported through Gomega assertions.
func WaitForMachineHealthCheckToRemediateUnhealthyNodeCondition(ctx context.Context, input framework.WaitForMachineHealthCheckToRemediateUnhealthyNodeConditionInput, intervals ...interface{}) {
	Expect(ctx).NotTo(BeNil(), "ctx is required for WaitForMachineHealthCheckToRemediateUnhealthyNodeCondition")
	Expect(input.ClusterProxy).ToNot(BeNil(), "Invalid argument. input.ClusterProxy can't be nil when calling WaitForMachineHealthCheckToRemediateUnhealthyNodeCondition")
	Expect(input.Cluster).ToNot(BeNil(), "Invalid argument. input.Cluster can't be nil when calling WaitForMachineHealthCheckToRemediateUnhealthyNodeCondition")
	Expect(input.MachineHealthCheck).NotTo(BeNil(), "Invalid argument. input.MachineHealthCheck can't be nil when calling WaitForMachineHealthCheckToRemediateUnhealthyNodeCondition")
	Expect(input.MachinesCount).NotTo(BeZero(), "Invalid argument. input.MachinesCount can't be zero when calling WaitForMachineHealthCheckToRemediateUnhealthyNodeCondition")

	fmt.Fprintln(GinkgoWriter, "Waiting until the node with unhealthy node condition is remediated")
	Eventually(func() bool {
		machines := framework.GetMachinesByMachineHealthCheck(ctx, framework.GetMachinesByMachineHealthCheckInput{
			Lister:             input.ClusterProxy.GetClient(),
			ClusterName:        input.Cluster.Name,
			MachineHealthCheck: input.MachineHealthCheck,
		})
		// Wait for all the machines to exist.
		// NOTE: this is required given that this helper is called after a remediation
		// and we want to make sure all the machines are back in place before testing for unhealthyCondition being fixed.
		fmt.Fprintf(GinkgoWriter, "waiting for all machines to exist, current count: %d, expected count: %d\n", len(machines), input.MachinesCount)
		if len(machines) < input.MachinesCount {
			return false
		}

		for _, machine := range machines {
			if machine.Status.NodeRef == nil {
				fmt.Fprintf(GinkgoWriter, "machine %s has no node ref\n", machine.Name)
				return false
			}
			node := &corev1.Node{}
			// This should not be an Expect(), because it may return error during machine deletion.
			err := input.ClusterProxy.GetWorkloadCluster(ctx, input.Cluster.Namespace, input.Cluster.Name).GetClient().Get(ctx, types.NamespacedName{Name: machine.Status.NodeRef.Name, Namespace: machine.Status.NodeRef.Namespace}, node)
			if err != nil {
				fmt.Fprintf(GinkgoWriter, "failed to get node from ref: %v\n", err)
				return false
			}
			// A match means the unhealthy condition is still present, i.e. remediation has not finished yet.
			if hasMatchingUnhealthyConditions(input.MachineHealthCheck, node.Status.Conditions) {
				fmt.Fprintf(GinkgoWriter, "machine %s still has a matching unhealthy condition\n", machine.Name)
				return false
			}
		}
		return true
	}, intervals...).Should(BeTrue())
}

// hasMatchingUnhealthyConditions reports whether at least one of the given node conditions has the
// same type and status as one of the unhealthy conditions declared by the machine health check.
// Both inputs are written to GinkgoWriter before they are compared.
func hasMatchingUnhealthyConditions(machineHealthCheck *clusterv1.MachineHealthCheck, nodeConditions []corev1.NodeCondition) bool {
	unhealthy := machineHealthCheck.Spec.UnhealthyConditions
	fmt.Fprintf(GinkgoWriter, "checking for matching unhealthy conditions, machine health check: %v, node conditions: %v\n", unhealthy, nodeConditions)

	for i := range unhealthy {
		for j := range nodeConditions {
			// A condition matches only when both its type and its status agree.
			if nodeConditions[j].Type != unhealthy[i].Type {
				continue
			}
			if nodeConditions[j].Status == unhealthy[i].Status {
				return true
			}
		}
	}
	return false
}

0 comments on commit 383a574

Please sign in to comment.