From 63351343ba07fa11763a03768e615ab6e1c9b39f Mon Sep 17 00:00:00 2001 From: Mathieu Fenniak Date: Thu, 2 Oct 2025 22:43:50 +0000 Subject: [PATCH] fix: improve logging to diagnose mystery job terminations (#1048) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Additional logging to support #1044. Manual testing only. Cases tested: Cancel a job from Forgejo UI; this seems like the most likely missing piece in #1044 as two jobs were simultaneously marked as "Failed". There are codepaths in Forgejo that can set this state to both cancelled and failed, but the runner didn't provide log output indicating that's why a job was stopping: ``` time="2025-10-02T13:22:53-06:00" level=info msg="UpdateTask returned task result RESULT_CANCELLED for a task that was in local state RESULT_UNSPECIFIED - beginning local task termination" func="[ReportState]" file="[reporter.go:410]" ``` Host-based executor hits step timeout in exec, or, is cancelled. This occurred but only logged the `err` from `exec`, not the context error indicating whether it was a timeout or a cancellation: ``` [Test Action/job1] this step has been cancelled: ctx: context deadline exceeded, exec: RUN signal: killed [Test Action/job1] this step has been cancelled: ctx: context canceled, exec: RUN signal: killed ``` Unable to `ReportState` due to Forgejo inaccessible. If the runner isn't able to update state to Forgejo a job could be considered a zombie; this would trigger one of the codepaths where the job would be marked as failed. If connectivity was later restored, then the runner could identify it was marked as failed and cancel the job context. (This combination doesn't seem likely, but, I think it's reasonable to consider these failures as warnings because there may be unexpected errors here that we're not aware of). ``` time="2025-10-02T13:27:19-06:00" level=warning msg="ReportState error: unavailable: 502 Bad Gateway" func="[RunDaemon]" file="[reporter.go:207]" ``` Runner shutdown logging; just changed up to `Info` level: ``` time="2025-10-02T13:31:36-06:00" level=info msg="forcing the jobs to shutdown" func="[Shutdown]" file="[poller.go:93]" [Test Action/job1] ❌ Failure - Main sleep 120 [Test Action/job1] this step has been cancelled: ctx: context canceled, exec: RUN signal: killed ``` - bug fixes - [PR](https://code.forgejo.org/forgejo/runner/pulls/1048): fix: improve logging to diagnose mystery job terminations Reviewed-on: https://code.forgejo.org/forgejo/runner/pulls/1048 Reviewed-by: earl-warren Co-authored-by: Mathieu Fenniak Co-committed-by: Mathieu Fenniak --- act/container/host_environment.go | 2 +- act/runner/run_context.go | 2 +- act/runner/step.go | 6 +++--- internal/app/poll/poller.go | 4 ++-- internal/pkg/report/reporter.go | 24 +++++++++++++++++++++--- 5 files changed, 28 insertions(+), 10 deletions(-) diff --git a/act/container/host_environment.go b/act/container/host_environment.go index 405862a9..c8742008 100644 --- a/act/container/host_environment.go +++ b/act/container/host_environment.go @@ -389,7 +389,7 @@ func (e *HostEnvironment) ExecWithCmdLine(command []string, cmdline string, env if err := e.exec(ctx, command, cmdline, env, user, workdir); err != nil { select { case <-ctx.Done(): - return fmt.Errorf("this step has been cancelled: %w", err) + return fmt.Errorf("this step has been cancelled: ctx: %w, exec: %w", ctx.Err(), err) default: return err } diff --git a/act/runner/run_context.go b/act/runner/run_context.go index f344a046..bb79ad3a 100644 --- a/act/runner/run_context.go +++ b/act/runner/run_context.go @@ -949,7 +949,7 @@ func (rc *RunContext) Executor() (common.Executor, error) { return err } if res { - timeoutctx, cancelTimeOut := evaluateTimeout(ctx, rc.ExprEval, rc.Run.Job().TimeoutMinutes) + timeoutctx, cancelTimeOut := evaluateTimeout(ctx, "job", rc.ExprEval, rc.Run.Job().TimeoutMinutes) defer cancelTimeOut() return executor(timeoutctx) diff --git a/act/runner/step.go b/act/runner/step.go index 322b6f7e..c93291c4 100644 --- a/act/runner/step.go +++ b/act/runner/step.go @@ -177,7 +177,7 @@ func runStepExecutor(step step, stage stepStage, executor common.Executor) commo Mode: 0o666, })(ctx) - timeoutctx, cancelTimeOut := evaluateTimeout(ctx, rc.ExprEval, stepModel.TimeoutMinutes) + timeoutctx, cancelTimeOut := evaluateTimeout(ctx, "step", rc.ExprEval, stepModel.TimeoutMinutes) defer cancelTimeOut() err = executor(timeoutctx) @@ -213,12 +213,12 @@ func runStepExecutor(step step, stage stepStage, executor common.Executor) commo } } -func evaluateTimeout(ctx context.Context, exprEval ExpressionEvaluator, timeoutMinutes string) (context.Context, context.CancelFunc) { +func evaluateTimeout(ctx context.Context, contextType string, exprEval ExpressionEvaluator, timeoutMinutes string) (context.Context, context.CancelFunc) { timeout := exprEval.Interpolate(ctx, timeoutMinutes) if timeout != "" { timeOutMinutes, err := strconv.ParseInt(timeout, 10, 64) if err == nil { - common.Logger(ctx).Debugf("the step will stop in timeout-minutes %s", timeout) + common.Logger(ctx).Debugf("the %s will stop in timeout-minutes %s", contextType, timeout) return context.WithTimeout(ctx, time.Duration(timeOutMinutes)*time.Minute) } common.Logger(ctx).Errorf("timeout-minutes %s cannot be parsed and will be ignored: %w", timeout, err) diff --git a/internal/app/poll/poller.go b/internal/app/poll/poller.go index 7b954cb7..c880bc67 100644 --- a/internal/app/poll/poller.go +++ b/internal/app/poll/poller.go @@ -90,10 +90,10 @@ func (p *poller) Shutdown(ctx context.Context) error { return nil case <-ctx.Done(): - log.Trace("forcing the jobs to shutdown") + log.Info("forcing the jobs to shutdown") p.shutdownJobs() <-p.done - log.Trace("all jobs have been shutdown") + log.Info("all jobs have been shutdown") return ctx.Err() } } diff --git a/internal/pkg/report/reporter.go b/internal/pkg/report/reporter.go index 7e46b8bb..804744cc 100644 --- a/internal/pkg/report/reporter.go +++ b/internal/pkg/report/reporter.go @@ -48,6 +48,7 @@ type Reporter struct { debugOutputEnabled bool stopCommandEndToken string + issuedLocalCancel bool } func NewReporter(ctx context.Context, cancel context.CancelFunc, c client.Client, task *runnerv1.Task, reportInterval time.Duration) *Reporter { @@ -192,11 +193,19 @@ func (r *Reporter) RunDaemon() { return } if r.ctx.Err() != nil { + // This shouldn't happen because DaemonContext is used for `r.ctx` which should outlive any running job. + log.Warnf("Terminating RunDaemon on an active job due to error: %v", r.ctx.Err()) return } - _ = r.ReportLog(false) - _ = r.ReportState() + err := r.ReportLog(false) + if err != nil { + log.Warnf("ReportLog error: %v", err) + } + err = r.ReportState() + if err != nil { + log.Warnf("ReportState error: %v", err) + } time.AfterFunc(r.reportInterval, r.RunDaemon) } @@ -391,8 +400,17 @@ func (r *Reporter) ReportState() error { r.outputs.Store(k, struct{}{}) } - switch resp.Msg.GetState().GetResult() { + localResultState := state.GetResult() + remoteResultState := resp.Msg.GetState().GetResult() + switch remoteResultState { case runnerv1.Result_RESULT_CANCELLED, runnerv1.Result_RESULT_FAILURE: + // issuedLocalCancel is just used to deduplicate this log message if our local state doesn't catch up with our + // remote state as quickly as the report-interval, which would cause this message to repeat in the logs. + if !r.issuedLocalCancel && remoteResultState != localResultState { + log.Infof("UpdateTask returned task result %v for a task that was in local state %v - beginning local task termination", + remoteResultState, localResultState) + r.issuedLocalCancel = true + } r.cancel() }