Skip to content

Commit

Permalink
Change app status to rollback on job start timeout (#298)
Browse files Browse the repository at this point in the history
Once a new Flink job is submitted, it’s possible that the job vertices
are not in the running state by the configured timeout period of 3
minutes.

There exists a bug in the flinkk8scontroller where the app will get
stuck in this phase, instead of being rolled back and marked as
DeployFailed.

Slack
[thread](https://lyft.slack.com/archives/CPN3NG47M/p1701461663432759?thread_ts=1701458116.814489&cid=CPN3NG47M)
  • Loading branch information
avneet15 authored Dec 19, 2023
1 parent e0a2885 commit 173badd
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 5 deletions.
4 changes: 3 additions & 1 deletion pkg/controller/flinkapplication/flink_state_machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -784,7 +784,9 @@ func (s *FlinkStateMachine) handleSubmittingJob(ctx context.Context, app *v1beta
if err != nil {
logger.Info(ctx, "Job monitoring failed with error: %v", err)
s.flinkController.LogEvent(ctx, app, corev1.EventTypeWarning, "JobMonitoringFailed", err.Error())
return statusUnchanged, err
s.flinkController.UpdateLatestJobID(ctx, app, "")
s.updateApplicationPhase(app, v1beta1.FlinkApplicationRollingBackJob)
return statusChanged, err
}
if jobStarted {
return updateJobAndReturn(ctx, s, app, hash)
Expand Down
14 changes: 10 additions & 4 deletions pkg/controller/flinkapplication/flink_state_machine_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -759,13 +759,16 @@ func TestSubmittingVertexFailsToStart(t *testing.T) {
mockK8Cluster.UpdateStatusFunc = func(ctx context.Context, object k8sclient.Object) error {
if statusUpdateCount == 1 {
application := object.(*v1beta1.FlinkApplication)
assert.Equal(t, jobID, mockFlinkController.GetLatestJobID(ctx, application))
assert.Equal(t, "", mockFlinkController.GetLatestJobID(ctx, application))
assert.Equal(t, v1beta1.FlinkApplicationRollingBackJob, application.Status.Phase)
} else if statusUpdateCount == 2 {
application := object.(*v1beta1.FlinkApplication)
assert.Equal(t, "", mockFlinkController.GetLatestJobID(ctx, application))
assert.Equal(t, v1beta1.FlinkApplicationDeployFailed, application.Status.Phase)
} else if statusUpdateCount == 3 {
application := object.(*v1beta1.FlinkApplication)
assert.Equal(t, v1beta1.FlinkApplicationRollingBackJob, application.Status.Phase)
assert.Equal(t, v1beta1.FlinkApplicationDeployFailed, application.Status.Phase)
assert.Equal(t, jobID, mockFlinkController.GetLatestJobID(ctx, application))
}
statusUpdateCount++
return nil
Expand Down Expand Up @@ -925,13 +928,16 @@ func TestSubmittingVertexStartTimeout(t *testing.T) {
mockK8Cluster.UpdateStatusFunc = func(ctx context.Context, object k8sclient.Object) error {
if statusUpdateCount == 1 {
application := object.(*v1beta1.FlinkApplication)
assert.Equal(t, jobID, mockFlinkController.GetLatestJobID(ctx, application))
assert.Equal(t, "", mockFlinkController.GetLatestJobID(ctx, application))
assert.Equal(t, v1beta1.FlinkApplicationRollingBackJob, application.Status.Phase)
} else if statusUpdateCount == 2 {
application := object.(*v1beta1.FlinkApplication)
assert.Equal(t, "", mockFlinkController.GetLatestJobID(ctx, application))
assert.Equal(t, v1beta1.FlinkApplicationDeployFailed, application.Status.Phase)
} else if statusUpdateCount == 3 {
application := object.(*v1beta1.FlinkApplication)
assert.Equal(t, v1beta1.FlinkApplicationRollingBackJob, application.Status.Phase)
assert.Equal(t, "", mockFlinkController.GetLatestJobID(ctx, application))
assert.Equal(t, v1beta1.FlinkApplicationDeployFailed, application.Status.Phase)
}
statusUpdateCount++
return nil
Expand Down

0 comments on commit 173badd

Please sign in to comment.