mirror of
https://code.forgejo.org/forgejo/runner.git
synced 2025-09-05 18:40:59 +00:00
fix: race condition in matrix job result state may result in failed jobs being marked as successful (#862)
In `setJobResult` there is no coordination between multiple jobs that are completing, leading to a possible condition where `jobResult` can be read from the matrix job as `"success"` by a job, marked as `"failed"` by another job, and then marked as `"success"` by other jobs. To my knowledge, the race condition has not been observed in a real-world case, but has been reproduced in a unit test. ``` ================== WARNING: DATA RACE Read at 0x00c0006d08a0 by goroutine 29232: code.forgejo.org/forgejo/runner/v9/act/runner.setJobResult() /.../forgejo-runner/act/runner/job_executor.go:173 +0x359 code.forgejo.org/forgejo/runner/v9/act/runner.newJobExecutor.func6() /.../forgejo-runner/act/runner/job_executor.go:118 +0x15d code.forgejo.org/forgejo/runner/v9/act/runner.newJobExecutor.Executor.Finally.func14() /.../forgejo-runner/act/common/executor.go:183 +0x86 code.forgejo.org/forgejo/runner/v9/act/runner.newJobExecutor.func7() /.../forgejo-runner/act/runner/job_executor.go:161 +0x191 code.forgejo.org/forgejo/runner/v9/act/runner.newJobExecutor.Executor.Finally.func16() /.../forgejo-runner/act/common/executor.go:183 +0x86 ... Previous write at 0x00c0006d08a0 by goroutine 29234: code.forgejo.org/forgejo/runner/v9/act/runner.(*RunContext).result() /.../forgejo-runner/act/runner/run_context.go:897 +0x271 code.forgejo.org/forgejo/runner/v9/act/runner.setJobResult() /.../forgejo-runner/act/runner/job_executor.go:181 +0x66e code.forgejo.org/forgejo/runner/v9/act/runner.newJobExecutor.func6() /.../forgejo-runner/act/runner/job_executor.go:118 +0x15d code.forgejo.org/forgejo/runner/v9/act/runner.newJobExecutor.Executor.Finally.func14() /.../forgejo-runner/act/common/executor.go:183 +0x86 code.forgejo.org/forgejo/runner/v9/act/runner.newJobExecutor.func7() /.../forgejo-runner/act/runner/job_executor.go:161 +0x191 code.forgejo.org/forgejo/runner/v9/act/runner.newJobExecutor.Executor.Finally.func16() /.../forgejo-runner/act/common/executor.go:183 +0x86 ... ================== ``` <!--start release-notes-assistant--> <!--URL:https://code.forgejo.org/forgejo/runner--> - bug fixes - [PR](https://code.forgejo.org/forgejo/runner/pulls/862): <!--number 862 --><!--line 0 --><!--description Zml4OiByYWNlIGNvbmRpdGlvbiBpbiBtYXRyaXggam9iIHJlc3VsdCBzdGF0ZSBtYXkgcmVzdWx0IGluIGZhaWxlZCBqb2JzIGJlaW5nIG1hcmtlZCBhcyBzdWNjZXNzZnVs-->fix: race condition in matrix job result state may result in failed jobs being marked as successful<!--description--> <!--end release-notes-assistant--> Reviewed-on: https://code.forgejo.org/forgejo/runner/pulls/862 Reviewed-by: earl-warren <earl-warren@noreply.code.forgejo.org> Co-authored-by: Mathieu Fenniak <mathieu@fenniak.net> Co-committed-by: Mathieu Fenniak <mathieu@fenniak.net>
This commit is contained in:
parent
03fbeab01b
commit
69a3adad21
3 changed files with 82 additions and 1 deletions
|
@ -11,6 +11,7 @@ import (
|
||||||
"slices"
|
"slices"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
|
"sync"
|
||||||
|
|
||||||
"code.forgejo.org/forgejo/runner/v9/act/common"
|
"code.forgejo.org/forgejo/runner/v9/act/common"
|
||||||
"code.forgejo.org/forgejo/runner/v9/act/schema"
|
"code.forgejo.org/forgejo/runner/v9/act/schema"
|
||||||
|
@ -217,7 +218,9 @@ type Job struct {
|
||||||
Uses string `yaml:"uses"`
|
Uses string `yaml:"uses"`
|
||||||
With map[string]any `yaml:"with"`
|
With map[string]any `yaml:"with"`
|
||||||
RawSecrets yaml.Node `yaml:"secrets"`
|
RawSecrets yaml.Node `yaml:"secrets"`
|
||||||
Result string
|
|
||||||
|
Result string
|
||||||
|
ResultMutex sync.Mutex
|
||||||
}
|
}
|
||||||
|
|
||||||
// Strategy for the job
|
// Strategy for the job
|
||||||
|
|
|
@ -162,6 +162,11 @@ func newJobExecutor(info jobInfo, sf stepFactory, rc *RunContext) common.Executo
|
||||||
func setJobResult(ctx context.Context, info jobInfo, rc *RunContext, success bool) {
|
func setJobResult(ctx context.Context, info jobInfo, rc *RunContext, success bool) {
|
||||||
logger := common.Logger(ctx)
|
logger := common.Logger(ctx)
|
||||||
|
|
||||||
|
// As we're reading the matrix build's status (`rc.Run.Job().Result`), it's possible for it change in another
|
||||||
|
// goroutine running `setJobResult` and invoking `.result(...)`. Prevent concurrent execution of `setJobResult`...
|
||||||
|
rc.Run.Job().ResultMutex.Lock()
|
||||||
|
defer rc.Run.Job().ResultMutex.Unlock()
|
||||||
|
|
||||||
jobResult := "success"
|
jobResult := "success"
|
||||||
// we have only one result for a whole matrix build, so we need
|
// we have only one result for a whole matrix build, so we need
|
||||||
// to keep an existing result state if we run a matrix
|
// to keep an existing result state if we run a matrix
|
||||||
|
|
|
@ -5,7 +5,9 @@ import (
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"slices"
|
"slices"
|
||||||
|
"sync"
|
||||||
"testing"
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
"code.forgejo.org/forgejo/runner/v9/act/common"
|
"code.forgejo.org/forgejo/runner/v9/act/common"
|
||||||
"code.forgejo.org/forgejo/runner/v9/act/container"
|
"code.forgejo.org/forgejo/runner/v9/act/container"
|
||||||
|
@ -333,3 +335,74 @@ func TestJobExecutorNewJobExecutor(t *testing.T) {
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestSetJobResultConcurrency(t *testing.T) {
|
||||||
|
jim := &jobInfoMock{}
|
||||||
|
job := model.Job{
|
||||||
|
Result: "success",
|
||||||
|
}
|
||||||
|
// Distinct RunContext objects are used to replicate realistic setJobResult in matrix build
|
||||||
|
rc1 := &RunContext{
|
||||||
|
Run: &model.Run{
|
||||||
|
JobID: "test",
|
||||||
|
Workflow: &model.Workflow{
|
||||||
|
Jobs: map[string]*model.Job{
|
||||||
|
"test": &job,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
rc2 := &RunContext{
|
||||||
|
Run: &model.Run{
|
||||||
|
JobID: "test",
|
||||||
|
Workflow: &model.Workflow{
|
||||||
|
Jobs: map[string]*model.Job{
|
||||||
|
"test": &job,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
jim.On("matrix").Return(map[string]interface{}{
|
||||||
|
"python": []string{"3.10", "3.11", "3.12"},
|
||||||
|
})
|
||||||
|
|
||||||
|
// Synthesize a race condition in setJobResult where, by reading data from the job matrix earlier and then
|
||||||
|
// performing unsynchronzied writes to the same shared data structure, it can overwrite a failure status.
|
||||||
|
//
|
||||||
|
// Goroutine 1: Start marking job as success
|
||||||
|
// (artificially suspended
|
||||||
|
// by result() mock)
|
||||||
|
// Goroutine 2: Mark job as failure
|
||||||
|
// Goroutine 1: Finish marking job as success
|
||||||
|
//
|
||||||
|
// Correct behavior: Job is marked as a failure
|
||||||
|
// Bug behavior: Job is marked as a success
|
||||||
|
|
||||||
|
var lastResult string
|
||||||
|
jim.On("result", mock.Anything).Run(func(args mock.Arguments) {
|
||||||
|
result := args.String(0)
|
||||||
|
// Artificially suspend the "success" case so that the failure case races past it.
|
||||||
|
if result == "success" {
|
||||||
|
time.Sleep(1 * time.Second)
|
||||||
|
}
|
||||||
|
job.Result = result
|
||||||
|
lastResult = result
|
||||||
|
})
|
||||||
|
|
||||||
|
var wg sync.WaitGroup
|
||||||
|
wg.Add(2)
|
||||||
|
// Goroutine 1, mark as success:
|
||||||
|
go func() {
|
||||||
|
defer wg.Done()
|
||||||
|
setJobResult(t.Context(), jim, rc1, true)
|
||||||
|
}()
|
||||||
|
// Goroutine 2, mark as failure:
|
||||||
|
go func() {
|
||||||
|
defer wg.Done()
|
||||||
|
setJobResult(t.Context(), jim, rc2, false)
|
||||||
|
}()
|
||||||
|
wg.Wait()
|
||||||
|
|
||||||
|
assert.Equal(t, "failure", lastResult)
|
||||||
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue