diff --git a/internal/directives/simple_engine_promote.go b/internal/directives/simple_engine_promote.go index d262e83b8..0cf8fe2a4 100644 --- a/internal/directives/simple_engine_promote.go +++ b/internal/directives/simple_engine_promote.go @@ -112,29 +112,70 @@ func (e *SimpleEngine) executeSteps( // Execute the step result, err := e.executeStep(ctx, promoCtx, step, reg, workDir, state) + stepExecMeta.Status = result.Status + stepExecMeta.Message = result.Message + state[step.Alias] = result.Output + + switch result.Status { + case kargoapi.PromotionPhaseErrored, kargoapi.PromotionPhaseFailed, + kargoapi.PromotionPhaseRunning, kargoapi.PromotionPhaseSucceeded: + default: + // Deal with statuses that no step should have returned. + stepExecMeta.FinishedAt = ptr.To(metav1.Now()) + return PromotionResult{ + Status: kargoapi.PromotionPhaseErrored, + CurrentStep: i, + StepExecutionMetadata: stepExecMetas, + State: state, + HealthCheckSteps: healthChecks, + }, fmt.Errorf("step %d returned an invalid status", i) + } + + // Reconcile status and err... if err != nil { - // Let a hard error take precedence over the result status and message. - stepExecMeta.Status = kargoapi.PromotionPhaseErrored + if stepExecMeta.Status != kargoapi.PromotionPhaseFailed { + // All states other than Errored and Failed should be mutually exclusive + // with a hard error. If we got to here, a step has violated this + // assumption. We will prioritize the error over the status and change + // the status to Errored. + stepExecMeta.Status = kargoapi.PromotionPhaseErrored + } + // Let the hard error take precedence over the message. stepExecMeta.Message = err.Error() - } else { - stepExecMeta.Status = result.Status - stepExecMeta.Message = result.Message + } else if result.Status == kargoapi.PromotionPhaseErrored { + // A nil err should be mutually exclusive with an Errored status. If we + // got to here, a step has violated this assumption. We will prioritize + // the Errored status over the nil error and create an error. + message := stepExecMeta.Message + if message == "" { + message = "no details provided" + } + err = fmt.Errorf("step %d errored: %s", i, message) } - state[step.Alias] = result.Output - if stepExecMeta.Status == kargoapi.PromotionPhaseSucceeded { + // At this point, we've sorted out any discrepancies between the status and + // err. + + switch { + case stepExecMeta.Status == kargoapi.PromotionPhaseSucceeded: + // Best case scenario: The step succeeded. stepExecMeta.FinishedAt = ptr.To(metav1.Now()) if healthCheck := result.HealthCheckStep; healthCheck != nil { healthChecks = append(healthChecks, *healthCheck) } continue // Move on to the next step - } - - // Treat errors and logical failures the same for now. - // TODO(krancour): In the future, we should fail without retry for logical - // failures and unrecoverable errors and retry only those errors with a - // chance of recovery. - if stepExecMeta.Status != kargoapi.PromotionPhaseRunning { + case isTerminal(err): + // This is an unrecoverable error. + stepExecMeta.FinishedAt = ptr.To(metav1.Now()) + return PromotionResult{ + Status: stepExecMeta.Status, + CurrentStep: i, + StepExecutionMetadata: stepExecMetas, + State: state, + HealthCheckSteps: healthChecks, + }, fmt.Errorf("an unrecoverable error occurred: %w", err) + case err != nil: + // If we get to here, the error is POTENTIALLY recoverable. stepExecMeta.ErrorCount++ // Check if the error threshold has been met. errorThreshold := step.GetErrorThreshold(reg.Runner) @@ -154,8 +195,8 @@ func (e *SimpleEngine) executeSteps( } } - // If we get to here, the step is either running (waiting for some external - // condition to be met) or it errored/failed but did not meet the error + // If we get to here, the step is either Running (waiting for some external + // condition to be met) or it Errored/Failed but did not meet the error // threshold. Now we need to check if the timeout has elapsed. A nil timeout // or any non-positive timeout interval are treated as NO timeout, although // a nil timeout really shouldn't happen. @@ -172,8 +213,8 @@ func (e *SimpleEngine) executeSteps( }, fmt.Errorf("step %d timeout of %s has elapsed", i, timeout.String()) } - if stepExecMeta.Status != kargoapi.PromotionPhaseRunning { - // Treat the error/failure as if the step is still running so that the + if err != nil { + // Treat Errored/Failed as if the step is still running so that the // Promotion will be requeued. The step will be retried on the next // reconciliation. stepExecMeta.Message += "; step will be retried" @@ -186,7 +227,7 @@ func (e *SimpleEngine) executeSteps( }, nil } - // If we get to here, the step is still running (waiting for some external + // If we get to here, the step is still Running (waiting for some external // condition to be met). stepExecMeta.ErrorCount = 0 // Reset the error count return PromotionResult{ diff --git a/internal/directives/simple_engine_promote_test.go b/internal/directives/simple_engine_promote_test.go index 28625a6ac..72815e4b2 100644 --- a/internal/directives/simple_engine_promote_test.go +++ b/internal/directives/simple_engine_promote_test.go @@ -235,7 +235,35 @@ func TestSimpleEngine_executeSteps(t *testing.T) { }, }, { - name: "error on step execution; error threshold met", + name: "terminal error on step execution", + steps: []PromotionStep{ + {Kind: "success-step", Alias: "step1"}, + {Kind: "terminal-error-step", Alias: "step2"}, + }, + assertions: func(t *testing.T, result PromotionResult, err error) { + assert.ErrorContains(t, err, "an unrecoverable error occurred") + assert.Equal(t, kargoapi.PromotionPhaseErrored, result.Status) + assert.Equal(t, int64(1), result.CurrentStep) + assert.Len(t, result.StepExecutionMetadata, 2) + assert.Equal(t, kargoapi.PromotionPhaseSucceeded, result.StepExecutionMetadata[0].Status) + assert.NotNil(t, result.StepExecutionMetadata[0].StartedAt) + assert.NotNil(t, result.StepExecutionMetadata[0].FinishedAt) + assert.Equal(t, kargoapi.PromotionPhaseErrored, result.StepExecutionMetadata[1].Status) + assert.NotNil(t, result.StepExecutionMetadata[1].StartedAt) + assert.NotNil(t, result.StepExecutionMetadata[1].FinishedAt) + assert.Contains(t, result.StepExecutionMetadata[1].Message, "something went wrong") + + // Verify first step output is preserved in state + assert.Equal(t, State{ + "step1": map[string]any{ + "key": "value", + }, + "step2": map[string]any(nil), + }, result.State) + }, + }, + { + name: "non-terminal error on step execution; error threshold met", steps: []PromotionStep{ {Kind: "success-step", Alias: "step1"}, {Kind: "error-step", Alias: "step2"}, @@ -263,7 +291,7 @@ func TestSimpleEngine_executeSteps(t *testing.T) { }, }, { - name: "error on step execution; error threshold not met", + name: "non-terminal error on step execution; error threshold not met", steps: []PromotionStep{ { Kind: "error-step", @@ -283,6 +311,37 @@ func TestSimpleEngine_executeSteps(t *testing.T) { assert.Contains(t, result.StepExecutionMetadata[0].Message, "will be retried") }, }, + { + name: "non-terminal error on step execution; timeout elapsed", + promoCtx: PromotionContext{ + StepExecutionMetadata: kargoapi.StepExecutionMetadataList{{ + // Start time is set to an hour ago + StartedAt: ptr.To(metav1.NewTime(time.Now().Add(-time.Hour))), + }}, + }, + steps: []PromotionStep{ + { + Kind: "error-step", + Retry: &kargoapi.PromotionStepRetry{ + ErrorThreshold: 3, + Timeout: &metav1.Duration{ + Duration: time.Hour, + }, + }, + }, + }, + assertions: func(t *testing.T, result PromotionResult, err error) { + assert.ErrorContains(t, err, "timeout") + assert.ErrorContains(t, err, "has elapsed") + assert.Equal(t, kargoapi.PromotionPhaseErrored, result.Status) + assert.Equal(t, int64(0), result.CurrentStep) + assert.Len(t, result.StepExecutionMetadata, 1) + assert.Equal(t, kargoapi.PromotionPhaseErrored, result.StepExecutionMetadata[0].Status) + assert.NotNil(t, result.StepExecutionMetadata[0].StartedAt) + assert.NotNil(t, result.StepExecutionMetadata[0].FinishedAt) + assert.Equal(t, uint32(1), result.StepExecutionMetadata[0].ErrorCount) + }, + }, { name: "step is still running; timeout elapsed", promoCtx: PromotionContext{ @@ -306,6 +365,10 @@ func TestSimpleEngine_executeSteps(t *testing.T) { assert.ErrorContains(t, err, "has elapsed") assert.Equal(t, kargoapi.PromotionPhaseErrored, result.Status) assert.Equal(t, int64(0), result.CurrentStep) + assert.Len(t, result.StepExecutionMetadata, 1) + assert.Equal(t, kargoapi.PromotionPhaseRunning, result.StepExecutionMetadata[0].Status) + assert.NotNil(t, result.StepExecutionMetadata[0].StartedAt) + assert.NotNil(t, result.StepExecutionMetadata[0].FinishedAt) }, }, { @@ -318,13 +381,12 @@ func TestSimpleEngine_executeSteps(t *testing.T) { assert.Len(t, result.StepExecutionMetadata, 1) assert.Equal(t, kargoapi.PromotionPhaseRunning, result.StepExecutionMetadata[0].Status) assert.NotNil(t, result.StepExecutionMetadata[0].StartedAt) + assert.Nil(t, result.StepExecutionMetadata[0].FinishedAt) }, }, { - name: "context cancellation", - steps: []PromotionStep{ - {Kind: "context-waiter", Alias: "step1"}, - }, + name: "context cancellation", + steps: []PromotionStep{{Kind: "context-waiter"}}, assertions: func(t *testing.T, result PromotionResult, err error) { assert.ErrorContains(t, err, "met error threshold") assert.Equal(t, kargoapi.PromotionPhaseErrored, result.Status) @@ -369,6 +431,14 @@ func TestSimpleEngine_executeSteps(t *testing.T) { }, &StepRunnerPermissions{}, ) + testRegistry.RegisterPromotionStepRunner( + &mockPromotionStepRunner{ + name: "terminal-error-step", + runResult: PromotionStepResult{Status: kargoapi.PromotionPhaseErrored}, + runErr: &terminalError{err: errors.New("something went wrong")}, + }, + &StepRunnerPermissions{}, + ) testRegistry.RegisterPromotionStepRunner( &mockPromotionStepRunner{ name: "context-waiter",