feat: forgejo-runner-service.sh safeguard to avoid overheating

Re-creating the LXC container from scratch is expensive. When rebooting or when multiple units start at the same time, it may cause an excessive load. Use a global lock to guard this operation so that the re-creations happen in sequence and not in parallel. Each one typically takes around one minute to complete, which means that in the event of a reboot, it will take around 1 minute * the number of runners for the unlucky one to start. During this interval workflows will have to wait.
2025-06-27 16:35:58 +00:00 · 2025-01-27 14:17:15 +01:00 · 2025-01-27 14:17:15 +01:00 · b79578d98e
commit b79578d98e
parent 0232fe1255
1 changed files with 11 additions and 1 deletions
--- a/examples/lxc-systemd/forgejo-runner-service.sh
+++ b/examples/lxc-systemd/forgejo-runner-service.sh
@ -31,6 +31,7 @@ SELF_FILENAME=$(basename "$SELF")
 ETC=/etc/forgejo-runner
 LIB=/var/lib/forgejo-runner
 LOG=/var/log/forgejo-runner
+LOCK=/var/lock/forgejo-runner
 : ${HOST:=$(hostname)}

 LXC_IPV4_PREFIX="10.105.7"
@ -253,10 +254,19 @@ function daemon() {
  set -e
 }

-function start() {
+function destroy_and_create() {
  stop
  lxc-helpers.sh lxc_container_destroy $(lxc_name)
  lxc_create
+}
+
+function start() {
+  # it should be more than
+  # (time it takes for one runner to be recreated) * (number of runners)
+  # because they will all start at the same time on boot
+  local timeout=3600
+
+  flock --timeout $timeout $LOCK $SELF destroy_and_create

  local log=$LOG/$INPUTS_SERIAL.log
  if test -f $log; then