1
0
Fork 0
mirror of https://code.forgejo.org/forgejo/runner.git synced 2025-10-15 19:42:06 +00:00

fix: improve logging to diagnose mystery job terminations (#1048)

Additional logging to support #1044.

Manual testing only.  Cases tested:

Cancel a job from Forgejo UI; this seems like the most likely missing piece in #1044 as two jobs were simultaneously marked as "Failed".  There are codepaths in Forgejo that can set this state to both cancelled and failed, but the runner didn't provide log output indicating that's why a job was stopping:
```
time="2025-10-02T13:22:53-06:00" level=info msg="UpdateTask returned task result RESULT_CANCELLED for a task that was in local state RESULT_UNSPECIFIED - beginning local task termination" func="[ReportState]" file="[reporter.go:410]"
```

Host-based executor hits step timeout in exec, or, is cancelled.  This occurred but only logged the `err` from `exec`, not the context error indicating whether it was a timeout or a cancellation:
```
[Test Action/job1] this step has been cancelled: ctx: context deadline exceeded, exec: RUN signal: killed
[Test Action/job1] this step has been cancelled: ctx: context canceled, exec: RUN signal: killed
```

Unable to `ReportState` due to Forgejo inaccessible.  If the runner isn't able to update state to Forgejo a job could be considered a zombie; this would trigger one of the codepaths where the job would be marked as failed.  If connectivity was later restored, then the runner could identify it was marked as failed and cancel the job context.  (This combination doesn't seem likely, but, I think it's reasonable to consider these failures as warnings because there may be unexpected errors here that we're not aware of).
```
time="2025-10-02T13:27:19-06:00" level=warning msg="ReportState error: unavailable: 502 Bad Gateway" func="[RunDaemon]" file="[reporter.go:207]"
```

Runner shutdown logging; just changed up to `Info` level:
```
time="2025-10-02T13:31:36-06:00" level=info msg="forcing the jobs to shutdown" func="[Shutdown]" file="[poller.go:93]"
[Test Action/job1]     Failure - Main sleep 120
[Test Action/job1] this step has been cancelled: ctx: context canceled, exec: RUN signal: killed
```

<!--start release-notes-assistant-->
<!--URL:https://code.forgejo.org/forgejo/runner-->
- bug fixes
  - [PR](https://code.forgejo.org/forgejo/runner/pulls/1048): <!--number 1048 --><!--line 0 --><!--description Zml4OiBpbXByb3ZlIGxvZ2dpbmcgdG8gZGlhZ25vc2UgbXlzdGVyeSBqb2IgdGVybWluYXRpb25z-->fix: improve logging to diagnose mystery job terminations<!--description-->
<!--end release-notes-assistant-->

Reviewed-on: https://code.forgejo.org/forgejo/runner/pulls/1048
Reviewed-by: earl-warren <earl-warren@noreply.code.forgejo.org>
Co-authored-by: Mathieu Fenniak <mathieu@fenniak.net>
Co-committed-by: Mathieu Fenniak <mathieu@fenniak.net>
This commit is contained in:
Mathieu Fenniak 2025-10-02 22:43:50 +00:00 committed by earl-warren
parent 01766ff4e2
commit 63351343ba
No known key found for this signature in database
GPG key ID: F128CBE6AB3A7201
5 changed files with 28 additions and 10 deletions

View file

@ -389,7 +389,7 @@ func (e *HostEnvironment) ExecWithCmdLine(command []string, cmdline string, env
if err := e.exec(ctx, command, cmdline, env, user, workdir); err != nil {
select {
case <-ctx.Done():
return fmt.Errorf("this step has been cancelled: %w", err)
return fmt.Errorf("this step has been cancelled: ctx: %w, exec: %w", ctx.Err(), err)
default:
return err
}

View file

@ -949,7 +949,7 @@ func (rc *RunContext) Executor() (common.Executor, error) {
return err
}
if res {
timeoutctx, cancelTimeOut := evaluateTimeout(ctx, rc.ExprEval, rc.Run.Job().TimeoutMinutes)
timeoutctx, cancelTimeOut := evaluateTimeout(ctx, "job", rc.ExprEval, rc.Run.Job().TimeoutMinutes)
defer cancelTimeOut()
return executor(timeoutctx)

View file

@ -177,7 +177,7 @@ func runStepExecutor(step step, stage stepStage, executor common.Executor) commo
Mode: 0o666,
})(ctx)
timeoutctx, cancelTimeOut := evaluateTimeout(ctx, rc.ExprEval, stepModel.TimeoutMinutes)
timeoutctx, cancelTimeOut := evaluateTimeout(ctx, "step", rc.ExprEval, stepModel.TimeoutMinutes)
defer cancelTimeOut()
err = executor(timeoutctx)
@ -213,12 +213,12 @@ func runStepExecutor(step step, stage stepStage, executor common.Executor) commo
}
}
func evaluateTimeout(ctx context.Context, exprEval ExpressionEvaluator, timeoutMinutes string) (context.Context, context.CancelFunc) {
func evaluateTimeout(ctx context.Context, contextType string, exprEval ExpressionEvaluator, timeoutMinutes string) (context.Context, context.CancelFunc) {
timeout := exprEval.Interpolate(ctx, timeoutMinutes)
if timeout != "" {
timeOutMinutes, err := strconv.ParseInt(timeout, 10, 64)
if err == nil {
common.Logger(ctx).Debugf("the step will stop in timeout-minutes %s", timeout)
common.Logger(ctx).Debugf("the %s will stop in timeout-minutes %s", contextType, timeout)
return context.WithTimeout(ctx, time.Duration(timeOutMinutes)*time.Minute)
}
common.Logger(ctx).Errorf("timeout-minutes %s cannot be parsed and will be ignored: %w", timeout, err)

View file

@ -90,10 +90,10 @@ func (p *poller) Shutdown(ctx context.Context) error {
return nil
case <-ctx.Done():
log.Trace("forcing the jobs to shutdown")
log.Info("forcing the jobs to shutdown")
p.shutdownJobs()
<-p.done
log.Trace("all jobs have been shutdown")
log.Info("all jobs have been shutdown")
return ctx.Err()
}
}

View file

@ -48,6 +48,7 @@ type Reporter struct {
debugOutputEnabled bool
stopCommandEndToken string
issuedLocalCancel bool
}
func NewReporter(ctx context.Context, cancel context.CancelFunc, c client.Client, task *runnerv1.Task, reportInterval time.Duration) *Reporter {
@ -192,11 +193,19 @@ func (r *Reporter) RunDaemon() {
return
}
if r.ctx.Err() != nil {
// This shouldn't happen because DaemonContext is used for `r.ctx` which should outlive any running job.
log.Warnf("Terminating RunDaemon on an active job due to error: %v", r.ctx.Err())
return
}
_ = r.ReportLog(false)
_ = r.ReportState()
err := r.ReportLog(false)
if err != nil {
log.Warnf("ReportLog error: %v", err)
}
err = r.ReportState()
if err != nil {
log.Warnf("ReportState error: %v", err)
}
time.AfterFunc(r.reportInterval, r.RunDaemon)
}
@ -391,8 +400,17 @@ func (r *Reporter) ReportState() error {
r.outputs.Store(k, struct{}{})
}
switch resp.Msg.GetState().GetResult() {
localResultState := state.GetResult()
remoteResultState := resp.Msg.GetState().GetResult()
switch remoteResultState {
case runnerv1.Result_RESULT_CANCELLED, runnerv1.Result_RESULT_FAILURE:
// issuedLocalCancel is just used to deduplicate this log message if our local state doesn't catch up with our
// remote state as quickly as the report-interval, which would cause this message to repeat in the logs.
if !r.issuedLocalCancel && remoteResultState != localResultState {
log.Infof("UpdateTask returned task result %v for a task that was in local state %v - beginning local task termination",
remoteResultState, localResultState)
r.issuedLocalCancel = true
}
r.cancel()
}