Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix a bug for etcd removal machine controller [Etcd Proxy] #125

Merged
merged 1 commit into from
May 29, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
115 changes: 101 additions & 14 deletions controlplane/controllers/machine_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,12 @@ import (
"github.com/pkg/errors"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/klog/v2"
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
"sigs.k8s.io/cluster-api/controllers/external"
"sigs.k8s.io/cluster-api/util"
"sigs.k8s.io/cluster-api/util/annotations"
"sigs.k8s.io/cluster-api/util/collections"
"sigs.k8s.io/cluster-api/util/conditions"
"sigs.k8s.io/cluster-api/util/patch"
ctrl "sigs.k8s.io/controller-runtime"
Expand All @@ -19,6 +22,13 @@ import (
k3s "github.com/k3s-io/cluster-api-k3s/pkg/k3s"
)

var (
errNilNodeRef = errors.New("noderef is nil")
errNoControlPlaneNodes = errors.New("no control plane members")
errClusterIsBeingDeleted = errors.New("cluster is being deleted")
errControlPlaneIsBeingDeleted = errors.New("control plane is being deleted")
)

// KThreesControlPlaneReconciler reconciles a KThreesControlPlane object.
type MachineReconciler struct {
client.Client
Expand Down Expand Up @@ -91,24 +101,45 @@ func (r *MachineReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
return ctrl.Result{}, errors.Wrapf(err, "unable to get cluster")
}

workloadCluster, err := r.managementCluster.GetWorkloadCluster(ctx, util.ObjectKey(cluster))
err = r.isRemoveEtcdMemberNeeded(ctx, cluster, m)
isRemoveEtcdMemberNeeded := err == nil
if err != nil {
logger.Error(err, "failed to create client to workload cluster")
return ctrl.Result{}, errors.Wrapf(err, "failed to create client to workload cluster")
switch err {
case errNoControlPlaneNodes, errNilNodeRef, errClusterIsBeingDeleted, errControlPlaneIsBeingDeleted:
nodeName := ""
if m.Status.NodeRef != nil {
nodeName = m.Status.NodeRef.Name
}
logger.Info("Skipping removal for etcd member associated with Machine as it is not needed", "Node", klog.KRef("", nodeName), "cause", err.Error())
default:
return ctrl.Result{}, errors.Wrapf(err, "failed to check if k3s etcd member remove is needed")
}
}

etcdRemoved, err := workloadCluster.RemoveEtcdMemberForMachine(ctx, m)
if err != nil {
logger.Error(err, "failed to remove etcd member for machine")
return ctrl.Result{}, err
if isRemoveEtcdMemberNeeded {
workloadCluster, err := r.managementCluster.GetWorkloadCluster(ctx, util.ObjectKey(cluster))
if err != nil {
logger.Error(err, "failed to create client to workload cluster")
return ctrl.Result{}, errors.Wrapf(err, "failed to create client to workload cluster")
}

etcdRemoved, err := workloadCluster.RemoveEtcdMemberForMachine(ctx, m)
if err != nil {
logger.Error(err, "failed to remove etcd member for machine")
return ctrl.Result{}, err
}
if !etcdRemoved {
logger.Info("wait k3s embedded etcd controller to remove etcd")
return ctrl.Result{Requeue: true}, err
}

nodeName := ""
if m.Status.NodeRef != nil {
nodeName = m.Status.NodeRef.Name
}

logger.Info("etcd remove etcd member succeeded", "Node", klog.KRef("", nodeName))
}
if !etcdRemoved {
logger.Info("wait k3s embedded etcd controller to remove etcd")
return ctrl.Result{Requeue: true}, err
}

// It is possible that the machine has no machine ref yet, will record the machine name in log
logger.Info("etcd remove etcd member succeeded", "machine name", m.Name)

patchHelper, err := patch.NewHelper(m, r.Client)
if err != nil {
Expand All @@ -125,3 +156,59 @@ func (r *MachineReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct

return ctrl.Result{}, nil
}

// isRemoveEtcdMemberNeeded returns nil if the Machine's NodeRef is not nil
// and if the Machine is not the last control plane node in the cluster
// and if the Cluster/KThreesControlplane associated with the Machine is not deleted.
func (r *MachineReconciler) isRemoveEtcdMemberNeeded(ctx context.Context, cluster *clusterv1.Cluster, machine *clusterv1.Machine) error {
log := ctrl.LoggerFrom(ctx)
// Return early if the cluster is being deleted.
if !cluster.DeletionTimestamp.IsZero() {
return errClusterIsBeingDeleted
}

// Cannot remove etcd member if the node doesn't exist.
if machine.Status.NodeRef == nil {
return errNilNodeRef
}

// controlPlaneRef is an optional field in the Cluster so skip the external
// managed control plane check if it is nil
if cluster.Spec.ControlPlaneRef != nil {
controlPlane, err := external.Get(ctx, r.Client, cluster.Spec.ControlPlaneRef, cluster.Spec.ControlPlaneRef.Namespace)
if apierrors.IsNotFound(err) {
// If control plane object in the reference does not exist, log and skip check for
// external managed control plane
log.Error(err, "control plane object specified in cluster spec.controlPlaneRef does not exist", "kind", cluster.Spec.ControlPlaneRef.Kind, "name", cluster.Spec.ControlPlaneRef.Name)
} else {
if err != nil {
// If any other error occurs when trying to get the control plane object,
// return the error so we can retry
return err
}

// Return early if the object referenced by controlPlaneRef is being deleted.
if !controlPlane.GetDeletionTimestamp().IsZero() {
return errControlPlaneIsBeingDeleted
}
}
}

// Get all of the active machines that belong to this cluster.
machines, err := collections.GetFilteredMachinesForCluster(ctx, r.Client, cluster, collections.ActiveMachines)
if err != nil {
return err
}

// Whether or not it is okay to remove etcd member depends on the
// number of remaining control plane members and whether or not this
// machine is one of them.
numControlPlaneMachines := len(machines.Filter(collections.ControlPlaneMachines(cluster.Name)))
if numControlPlaneMachines == 0 {
// Do not remove etcd member if there are no remaining members of
// the control plane.
return errNoControlPlaneNodes
}
// Otherwise it is okay to remove etcd member.
return nil
}
Loading