From 63351343ba07fa11763a03768e615ab6e1c9b39f Mon Sep 17 00:00:00 2001
From: Mathieu Fenniak <mathieu@fenniak.net>
Date: Thu, 2 Oct 2025 22:43:50 +0000
Subject: [PATCH] fix: improve logging to diagnose mystery job terminations
 (#1048)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Additional logging to support #1044.

Manual testing only.  Cases tested:

Cancel a job from Forgejo UI; this seems like the most likely missing piece in #1044 as two jobs were simultaneously marked as "Failed".  There are codepaths in Forgejo that can set this state to both cancelled and failed, but the runner didn't provide log output indicating that's why a job was stopping:
```
time="2025-10-02T13:22:53-06:00" level=info msg="UpdateTask returned task result RESULT_CANCELLED for a task that was in local state RESULT_UNSPECIFIED - beginning local task termination" func="[ReportState]" file="[reporter.go:410]"
```

Host-based executor hits step timeout in exec, or, is cancelled.  This occurred but only logged the `err` from `exec`, not the context error indicating whether it was a timeout or a cancellation:
```
[Test Action/job1] this step has been cancelled: ctx: context deadline exceeded, exec: RUN signal: killed
[Test Action/job1] this step has been cancelled: ctx: context canceled, exec: RUN signal: killed
```

Unable to `ReportState` due to Forgejo inaccessible.  If the runner isn't able to update state to Forgejo a job could be considered a zombie; this would trigger one of the codepaths where the job would be marked as failed.  If connectivity was later restored, then the runner could identify it was marked as failed and cancel the job context.  (This combination doesn't seem likely, but, I think it's reasonable to consider these failures as warnings because there may be unexpected errors here that we're not aware of).
```
time="2025-10-02T13:27:19-06:00" level=warning msg="ReportState error: unavailable: 502 Bad Gateway" func="[RunDaemon]" file="[reporter.go:207]"
```

Runner shutdown logging; just changed up to `Info` level:
```
time="2025-10-02T13:31:36-06:00" level=info msg="forcing the jobs to shutdown" func="[Shutdown]" file="[poller.go:93]"
[Test Action/job1]   ❌  Failure - Main sleep 120
[Test Action/job1] this step has been cancelled: ctx: context canceled, exec: RUN signal: killed
```

<!--start release-notes-assistant-->
<!--URL:https://code.forgejo.org/forgejo/runner-->
- bug fixes
  - [PR](https://code.forgejo.org/forgejo/runner/pulls/1048): <!--number 1048 --><!--line 0 --><!--description Zml4OiBpbXByb3ZlIGxvZ2dpbmcgdG8gZGlhZ25vc2UgbXlzdGVyeSBqb2IgdGVybWluYXRpb25z-->fix: improve logging to diagnose mystery job terminations<!--description-->
<!--end release-notes-assistant-->

Reviewed-on: https://code.forgejo.org/forgejo/runner/pulls/1048
Reviewed-by: earl-warren <earl-warren@noreply.code.forgejo.org>
Co-authored-by: Mathieu Fenniak <mathieu@fenniak.net>
Co-committed-by: Mathieu Fenniak <mathieu@fenniak.net>
---
 act/container/host_environment.go |  2 +-
 act/runner/run_context.go         |  2 +-
 act/runner/step.go                |  6 +++---
 internal/app/poll/poller.go       |  4 ++--
 internal/pkg/report/reporter.go   | 24 +++++++++++++++++++++---
 5 files changed, 28 insertions(+), 10 deletions(-)

diff --git a/act/container/host_environment.go b/act/container/host_environment.go
index 405862a9..c8742008 100644
--- a/act/container/host_environment.go
+++ b/act/container/host_environment.go
@@ -389,7 +389,7 @@ func (e *HostEnvironment) ExecWithCmdLine(command []string, cmdline string, env
 		if err := e.exec(ctx, command, cmdline, env, user, workdir); err != nil {
 			select {
 			case <-ctx.Done():
-				return fmt.Errorf("this step has been cancelled: %w", err)
+				return fmt.Errorf("this step has been cancelled: ctx: %w, exec: %w", ctx.Err(), err)
 			default:
 				return err
 			}
diff --git a/act/runner/run_context.go b/act/runner/run_context.go
index f344a046..bb79ad3a 100644
--- a/act/runner/run_context.go
+++ b/act/runner/run_context.go
@@ -949,7 +949,7 @@ func (rc *RunContext) Executor() (common.Executor, error) {
 			return err
 		}
 		if res {
-			timeoutctx, cancelTimeOut := evaluateTimeout(ctx, rc.ExprEval, rc.Run.Job().TimeoutMinutes)
+			timeoutctx, cancelTimeOut := evaluateTimeout(ctx, "job", rc.ExprEval, rc.Run.Job().TimeoutMinutes)
 			defer cancelTimeOut()
 
 			return executor(timeoutctx)
diff --git a/act/runner/step.go b/act/runner/step.go
index 322b6f7e..c93291c4 100644
--- a/act/runner/step.go
+++ b/act/runner/step.go
@@ -177,7 +177,7 @@ func runStepExecutor(step step, stage stepStage, executor common.Executor) commo
 			Mode: 0o666,
 		})(ctx)
 
-		timeoutctx, cancelTimeOut := evaluateTimeout(ctx, rc.ExprEval, stepModel.TimeoutMinutes)
+		timeoutctx, cancelTimeOut := evaluateTimeout(ctx, "step", rc.ExprEval, stepModel.TimeoutMinutes)
 		defer cancelTimeOut()
 		err = executor(timeoutctx)
 
@@ -213,12 +213,12 @@ func runStepExecutor(step step, stage stepStage, executor common.Executor) commo
 	}
 }
 
-func evaluateTimeout(ctx context.Context, exprEval ExpressionEvaluator, timeoutMinutes string) (context.Context, context.CancelFunc) {
+func evaluateTimeout(ctx context.Context, contextType string, exprEval ExpressionEvaluator, timeoutMinutes string) (context.Context, context.CancelFunc) {
 	timeout := exprEval.Interpolate(ctx, timeoutMinutes)
 	if timeout != "" {
 		timeOutMinutes, err := strconv.ParseInt(timeout, 10, 64)
 		if err == nil {
-			common.Logger(ctx).Debugf("the step will stop in timeout-minutes %s", timeout)
+			common.Logger(ctx).Debugf("the %s will stop in timeout-minutes %s", contextType, timeout)
 			return context.WithTimeout(ctx, time.Duration(timeOutMinutes)*time.Minute)
 		}
 		common.Logger(ctx).Errorf("timeout-minutes %s cannot be parsed and will be ignored: %w", timeout, err)
diff --git a/internal/app/poll/poller.go b/internal/app/poll/poller.go
index 7b954cb7..c880bc67 100644
--- a/internal/app/poll/poller.go
+++ b/internal/app/poll/poller.go
@@ -90,10 +90,10 @@ func (p *poller) Shutdown(ctx context.Context) error {
 		return nil
 
 	case <-ctx.Done():
-		log.Trace("forcing the jobs to shutdown")
+		log.Info("forcing the jobs to shutdown")
 		p.shutdownJobs()
 		<-p.done
-		log.Trace("all jobs have been shutdown")
+		log.Info("all jobs have been shutdown")
 		return ctx.Err()
 	}
 }
diff --git a/internal/pkg/report/reporter.go b/internal/pkg/report/reporter.go
index 7e46b8bb..804744cc 100644
--- a/internal/pkg/report/reporter.go
+++ b/internal/pkg/report/reporter.go
@@ -48,6 +48,7 @@ type Reporter struct {
 
 	debugOutputEnabled  bool
 	stopCommandEndToken string
+	issuedLocalCancel   bool
 }
 
 func NewReporter(ctx context.Context, cancel context.CancelFunc, c client.Client, task *runnerv1.Task, reportInterval time.Duration) *Reporter {
@@ -192,11 +193,19 @@ func (r *Reporter) RunDaemon() {
 		return
 	}
 	if r.ctx.Err() != nil {
+		// This shouldn't happen because DaemonContext is used for `r.ctx` which should outlive any running job.
+		log.Warnf("Terminating RunDaemon on an active job due to error: %v", r.ctx.Err())
 		return
 	}
 
-	_ = r.ReportLog(false)
-	_ = r.ReportState()
+	err := r.ReportLog(false)
+	if err != nil {
+		log.Warnf("ReportLog error: %v", err)
+	}
+	err = r.ReportState()
+	if err != nil {
+		log.Warnf("ReportState error: %v", err)
+	}
 
 	time.AfterFunc(r.reportInterval, r.RunDaemon)
 }
@@ -391,8 +400,17 @@ func (r *Reporter) ReportState() error {
 		r.outputs.Store(k, struct{}{})
 	}
 
-	switch resp.Msg.GetState().GetResult() {
+	localResultState := state.GetResult()
+	remoteResultState := resp.Msg.GetState().GetResult()
+	switch remoteResultState {
 	case runnerv1.Result_RESULT_CANCELLED, runnerv1.Result_RESULT_FAILURE:
+		// issuedLocalCancel is just used to deduplicate this log message if our local state doesn't catch up with our
+		// remote state as quickly as the report-interval, which would cause this message to repeat in the logs.
+		if !r.issuedLocalCancel && remoteResultState != localResultState {
+			log.Infof("UpdateTask returned task result %v for a task that was in local state %v - beginning local task termination",
+				remoteResultState, localResultState)
+			r.issuedLocalCancel = true
+		}
 		r.cancel()
 	}