From 69a3adad2146df6d9ea0dbb81358d2db22770cd9 Mon Sep 17 00:00:00 2001 From: Mathieu Fenniak Date: Fri, 15 Aug 2025 19:11:57 +0000 Subject: [PATCH] fix: race condition in matrix job result state may result in failed jobs being marked as successful (#862) In `setJobResult` there is no coordination between multiple jobs that are completing, leading to a possible condition where `jobResult` can be read from the matrix job as `"success"` by a job, marked as `"failed"` by another job, and then marked as `"success"` by other jobs. To my knowledge, the race condition has not been observed in a real-world case, but has been reproduced in a unit test. ``` ================== WARNING: DATA RACE Read at 0x00c0006d08a0 by goroutine 29232: code.forgejo.org/forgejo/runner/v9/act/runner.setJobResult() /.../forgejo-runner/act/runner/job_executor.go:173 +0x359 code.forgejo.org/forgejo/runner/v9/act/runner.newJobExecutor.func6() /.../forgejo-runner/act/runner/job_executor.go:118 +0x15d code.forgejo.org/forgejo/runner/v9/act/runner.newJobExecutor.Executor.Finally.func14() /.../forgejo-runner/act/common/executor.go:183 +0x86 code.forgejo.org/forgejo/runner/v9/act/runner.newJobExecutor.func7() /.../forgejo-runner/act/runner/job_executor.go:161 +0x191 code.forgejo.org/forgejo/runner/v9/act/runner.newJobExecutor.Executor.Finally.func16() /.../forgejo-runner/act/common/executor.go:183 +0x86 ... Previous write at 0x00c0006d08a0 by goroutine 29234: code.forgejo.org/forgejo/runner/v9/act/runner.(*RunContext).result() /.../forgejo-runner/act/runner/run_context.go:897 +0x271 code.forgejo.org/forgejo/runner/v9/act/runner.setJobResult() /.../forgejo-runner/act/runner/job_executor.go:181 +0x66e code.forgejo.org/forgejo/runner/v9/act/runner.newJobExecutor.func6() /.../forgejo-runner/act/runner/job_executor.go:118 +0x15d code.forgejo.org/forgejo/runner/v9/act/runner.newJobExecutor.Executor.Finally.func14() /.../forgejo-runner/act/common/executor.go:183 +0x86 code.forgejo.org/forgejo/runner/v9/act/runner.newJobExecutor.func7() /.../forgejo-runner/act/runner/job_executor.go:161 +0x191 code.forgejo.org/forgejo/runner/v9/act/runner.newJobExecutor.Executor.Finally.func16() /.../forgejo-runner/act/common/executor.go:183 +0x86 ... ================== ``` - bug fixes - [PR](https://code.forgejo.org/forgejo/runner/pulls/862): fix: race condition in matrix job result state may result in failed jobs being marked as successful Reviewed-on: https://code.forgejo.org/forgejo/runner/pulls/862 Reviewed-by: earl-warren Co-authored-by: Mathieu Fenniak Co-committed-by: Mathieu Fenniak --- act/model/workflow.go | 5 ++- act/runner/job_executor.go | 5 +++ act/runner/job_executor_test.go | 73 +++++++++++++++++++++++++++++++++ 3 files changed, 82 insertions(+), 1 deletion(-) diff --git a/act/model/workflow.go b/act/model/workflow.go index 82082feb..3462d288 100644 --- a/act/model/workflow.go +++ b/act/model/workflow.go @@ -11,6 +11,7 @@ import ( "slices" "strconv" "strings" + "sync" "code.forgejo.org/forgejo/runner/v9/act/common" "code.forgejo.org/forgejo/runner/v9/act/schema" @@ -217,7 +218,9 @@ type Job struct { Uses string `yaml:"uses"` With map[string]any `yaml:"with"` RawSecrets yaml.Node `yaml:"secrets"` - Result string + + Result string + ResultMutex sync.Mutex } // Strategy for the job diff --git a/act/runner/job_executor.go b/act/runner/job_executor.go index d407b379..3cb4f206 100644 --- a/act/runner/job_executor.go +++ b/act/runner/job_executor.go @@ -162,6 +162,11 @@ func newJobExecutor(info jobInfo, sf stepFactory, rc *RunContext) common.Executo func setJobResult(ctx context.Context, info jobInfo, rc *RunContext, success bool) { logger := common.Logger(ctx) + // As we're reading the matrix build's status (`rc.Run.Job().Result`), it's possible for it change in another + // goroutine running `setJobResult` and invoking `.result(...)`. Prevent concurrent execution of `setJobResult`... + rc.Run.Job().ResultMutex.Lock() + defer rc.Run.Job().ResultMutex.Unlock() + jobResult := "success" // we have only one result for a whole matrix build, so we need // to keep an existing result state if we run a matrix diff --git a/act/runner/job_executor_test.go b/act/runner/job_executor_test.go index 6e8a452e..3915b971 100644 --- a/act/runner/job_executor_test.go +++ b/act/runner/job_executor_test.go @@ -5,7 +5,9 @@ import ( "fmt" "io" "slices" + "sync" "testing" + "time" "code.forgejo.org/forgejo/runner/v9/act/common" "code.forgejo.org/forgejo/runner/v9/act/container" @@ -333,3 +335,74 @@ func TestJobExecutorNewJobExecutor(t *testing.T) { }) } } + +func TestSetJobResultConcurrency(t *testing.T) { + jim := &jobInfoMock{} + job := model.Job{ + Result: "success", + } + // Distinct RunContext objects are used to replicate realistic setJobResult in matrix build + rc1 := &RunContext{ + Run: &model.Run{ + JobID: "test", + Workflow: &model.Workflow{ + Jobs: map[string]*model.Job{ + "test": &job, + }, + }, + }, + } + rc2 := &RunContext{ + Run: &model.Run{ + JobID: "test", + Workflow: &model.Workflow{ + Jobs: map[string]*model.Job{ + "test": &job, + }, + }, + }, + } + + jim.On("matrix").Return(map[string]interface{}{ + "python": []string{"3.10", "3.11", "3.12"}, + }) + + // Synthesize a race condition in setJobResult where, by reading data from the job matrix earlier and then + // performing unsynchronzied writes to the same shared data structure, it can overwrite a failure status. + // + // Goroutine 1: Start marking job as success + // (artificially suspended + // by result() mock) + // Goroutine 2: Mark job as failure + // Goroutine 1: Finish marking job as success + // + // Correct behavior: Job is marked as a failure + // Bug behavior: Job is marked as a success + + var lastResult string + jim.On("result", mock.Anything).Run(func(args mock.Arguments) { + result := args.String(0) + // Artificially suspend the "success" case so that the failure case races past it. + if result == "success" { + time.Sleep(1 * time.Second) + } + job.Result = result + lastResult = result + }) + + var wg sync.WaitGroup + wg.Add(2) + // Goroutine 1, mark as success: + go func() { + defer wg.Done() + setJobResult(t.Context(), jim, rc1, true) + }() + // Goroutine 2, mark as failure: + go func() { + defer wg.Done() + setJobResult(t.Context(), jim, rc2, false) + }() + wg.Wait() + + assert.Equal(t, "failure", lastResult) +}