From b79578d98eba7080c396aa64cb99435beca38b06 Mon Sep 17 00:00:00 2001 From: Earl Warren Date: Mon, 27 Jan 2025 14:17:15 +0100 Subject: [PATCH] feat: forgejo-runner-service.sh safeguard to avoid overheating Re-creating the LXC container from scratch is expensive. When rebooting or when multiple units start at the same time, it may cause an excessive load. Use a global lock to guard this operation so that re-creations happen in sequence and not in parallel. Each one typically takes around one minute to complete, which means that in the event of a reboot, it will take around 1 minute * the number of runners for the unlucky one to start. During this interval workflows will have to wait. --- examples/lxc-systemd/forgejo-runner-service.sh | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/examples/lxc-systemd/forgejo-runner-service.sh b/examples/lxc-systemd/forgejo-runner-service.sh index 8bdc73df..2ff51dc1 100755 --- a/examples/lxc-systemd/forgejo-runner-service.sh +++ b/examples/lxc-systemd/forgejo-runner-service.sh @@ -31,6 +31,7 @@ SELF_FILENAME=$(basename "$SELF") ETC=/etc/forgejo-runner LIB=/var/lib/forgejo-runner LOG=/var/log/forgejo-runner +LOCK=/var/lock/forgejo-runner : ${HOST:=$(hostname)} LXC_IPV4_PREFIX="10.105.7" @@ -253,10 +254,19 @@ function daemon() { set -e } -function start() { +function destroy_and_create() { stop lxc-helpers.sh lxc_container_destroy $(lxc_name) lxc_create +} + +function start() { + # it should be more than + # (time it takes for one runner to be recreated) * (number of runners) + # because they will all start at the same time on boot + local timeout=3600 + + flock --timeout $timeout $LOCK $SELF destroy_and_create local log=$LOG/$INPUTS_SERIAL.log if test -f $log; then