From b46c03d75a1e86dc619c408d313b97863547ed6e Mon Sep 17 00:00:00 2001 From: Mathieu Fenniak Date: Tue, 5 Aug 2025 14:58:53 -0600 Subject: [PATCH] wait as long as required for health check configuration timeout when context is cancelled, rather than fixed iteration count --- act/container/container_types.go | 4 ++- act/container/docker_run.go | 41 +++++++++++++++++++++++++------ act/container/host_environment.go | 8 ++++-- act/runner/run_context.go | 33 ++++++++++++++++++------- 4 files changed, 66 insertions(+), 20 deletions(-) diff --git a/act/container/container_types.go b/act/container/container_types.go index 0dea315a..4790d852 100644 --- a/act/container/container_types.go +++ b/act/container/container_types.go @@ -3,6 +3,7 @@ package container import ( "context" "io" + "time" "code.forgejo.org/forgejo/runner/v9/act/common" "github.com/docker/go-connections/nat" @@ -63,7 +64,8 @@ type Container interface { Remove() common.Executor Close() common.Executor ReplaceLogWriter(io.Writer, io.Writer) (io.Writer, io.Writer) - GetHealth(ctx context.Context) Health + GetHealth(ctx context.Context) (Health, error) + GetHealthCheckTimeout(ctx context.Context) (*time.Duration, error) } // NewDockerBuildExecutorInput the input for the NewDockerBuildExecutor function diff --git a/act/container/docker_run.go b/act/container/docker_run.go index 4d3073c8..9ed50a69 100644 --- a/act/container/docker_run.go +++ b/act/container/docker_run.go @@ -15,6 +15,7 @@ import ( "runtime" "strconv" "strings" + "time" "github.com/Masterminds/semver" "github.com/docker/cli/cli/compose/loader" @@ -191,28 +192,52 @@ func (cr *containerReference) Remove() common.Executor { ).IfNot(common.Dryrun) } -func (cr *containerReference) GetHealth(ctx context.Context) Health { +func (cr *containerReference) GetHealth(ctx context.Context) (Health, error) { resp, err := cr.cli.ContainerInspect(ctx, cr.id) logger := common.Logger(ctx) if err != nil { - logger.Errorf("failed to query container health %s", err) - return HealthUnHealthy + return HealthUnHealthy, err } if resp.Config == nil || resp.Config.Healthcheck == nil || resp.State == nil || resp.State.Health == nil || len(resp.Config.Healthcheck.Test) == 1 && strings.EqualFold(resp.Config.Healthcheck.Test[0], "NONE") { logger.Debugf("no container health check defined") - return HealthHealthy + return HealthHealthy, nil } logger.Infof("container health of %s (%s) is %s", cr.id, resp.Config.Image, resp.State.Health.Status) switch resp.State.Health.Status { case "starting": - return HealthStarting + return HealthStarting, nil case "healthy": - return HealthHealthy + return HealthHealthy, nil case "unhealthy": - return HealthUnHealthy + return HealthUnHealthy, nil } - return HealthUnHealthy + return HealthUnHealthy, fmt.Errorf("unrecognized health state: %v", resp.State.Health.Status) +} + +func (cr *containerReference) GetHealthCheckTimeout(ctx context.Context) (*time.Duration, error) { + resp, err := cr.cli.ContainerInspect(ctx, cr.id) + if err != nil { + return nil, err + } + if resp.Config == nil || resp.Config.Healthcheck == nil || len(resp.Config.Healthcheck.Test) == 1 && strings.EqualFold(resp.Config.Healthcheck.Test[0], "NONE") { + return nil, nil + } + + retries := time.Duration(resp.Config.Healthcheck.Retries) + // Prefer using `--health-start-interval` option since we're using this timing for service container startup, but + // fallback to `--health-interval` if it isn't defined. + interval := resp.Config.Healthcheck.StartInterval + if interval == 0 { + interval = resp.Config.Healthcheck.Interval + } + + // Docker will run one health check, with a maximum cmd time of Timeout, every StartInterval, up to the number of + // Retries, after an initial pause of StartPeriod. Therefore the pessimistic time that we would wait is... + maxWait := resp.Config.Healthcheck.StartPeriod + + (retries * resp.Config.Healthcheck.Timeout) + + (retries * interval) + return &maxWait, nil } func (cr *containerReference) ReplaceLogWriter(stdout, stderr io.Writer) (io.Writer, io.Writer) { diff --git a/act/container/host_environment.go b/act/container/host_environment.go index 8f36049a..77fa917b 100644 --- a/act/container/host_environment.go +++ b/act/container/host_environment.go @@ -493,8 +493,12 @@ func (e *HostEnvironment) GetRunnerContext(_ context.Context) map[string]interfa } } -func (e *HostEnvironment) GetHealth(ctx context.Context) Health { - return HealthHealthy +func (e *HostEnvironment) GetHealth(ctx context.Context) (Health, error) { + return HealthHealthy, nil +} + +func (e *HostEnvironment) GetHealthCheckTimeout(ctx context.Context) (*time.Duration, error) { + return nil, nil } func (e *HostEnvironment) ReplaceLogWriter(stdout, _ io.Writer) (io.Writer, io.Writer) { diff --git a/act/runner/run_context.go b/act/runner/run_context.go index d3f14f00..b4c5866a 100644 --- a/act/runner/run_context.go +++ b/act/runner/run_context.go @@ -746,15 +746,34 @@ func (rc *RunContext) startServiceContainers(_ string) common.Executor { } func (rc *RunContext) waitForServiceContainer(c container.ExecutionsEnvironment) common.Executor { + // FIXME: GetName() is definitely 'wrong' because it just returns "NAME". :-p + return func(ctx context.Context) error { - sctx, cancel := context.WithTimeout(ctx, time.Minute*5) + logger := common.Logger(ctx) + timeout, err := c.GetHealthCheckTimeout(ctx) + if err != nil { + return fmt.Errorf("service container %s could not detect health check timeout due to error, no health check wait will occur: %v", c.GetName(), err) + } else if timeout == nil { + logger.Debugf("service container %s had no health check", c.GetName()) + return nil + } + + ctx, cancel := context.WithTimeout(ctx, *timeout) defer cancel() - var health container.Health + delay := time.Second for i := 0; ; i++ { - health = c.GetHealth(sctx) - if health != container.HealthStarting || i > 30 { - break + health, err := c.GetHealth(ctx) + if errors.Is(err, context.DeadlineExceeded) { + return fmt.Errorf("service container %s: timed out while waiting for healthy or unhealthy status to be reported", c.GetName()) + } else if errors.Is(err, context.Canceled) { + return err + } else if err != nil { + logger.Warnf("service container %s: error while checking for health state, will retry: %v", c.GetName(), err) + } else if health == container.HealthUnHealthy { + return fmt.Errorf("service container %s failed health check", c.GetName()) + } else if health == container.HealthHealthy { + return nil } time.Sleep(delay) delay *= 2 @@ -762,10 +781,6 @@ func (rc *RunContext) waitForServiceContainer(c container.ExecutionsEnvironment) delay = 10 * time.Second } } - if health == container.HealthHealthy { - return nil - } - return fmt.Errorf("service container failed to start") } }