From 03987a8009fb874bcbf1167f85f5df2304a5556a Mon Sep 17 00:00:00 2001 From: Clark Boylan Date: Thu, 16 Jun 2022 08:41:44 -0700 Subject: [PATCH] Run zuul cluster reboots and updates automatically This adds a weekly cronjob that will reboot and update our entire zuul cluster gracefully. The time frame chosen for this should be after North America begins its weekend and before Europe starts their week. The idea is that we're doing this during the quiet time of our week. Change-Id: Ib9a54f273e11744fa1ddbf367c291289f86bddb7 --- playbooks/service-bridge.yaml | 20 ++++++++++++++++++++ playbooks/zuul_reboot.yaml | 3 ++- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/playbooks/service-bridge.yaml b/playbooks/service-bridge.yaml index 164d924773..7f84396435 100644 --- a/playbooks/service-bridge.yaml +++ b/playbooks/service-bridge.yaml @@ -39,3 +39,23 @@ state: directory owner: root mode: 0755 + + - name: Automated Zuul cluster reboots and updates + # Note this is run via cron because a zuul job can't run this playbook + # as the playbook relies on all jobs ending for graceful stops on the + # executors. + cron: + name: "Zuul cluster restart" + # Start Saturdays at 00:01 UTC (cron weekday 6 is Saturday). + # Estimated completion time Saturday at 18:00 UTC. 
+ minute: 1 + hour: 0 + weekday: 6 + job: "flock -n /var/run/ansible/zuul_reboot.lock /usr/local/bin/ansible-playbook -f 20 /home/zuul/src/opendev.org/opendev/system-config/playbooks/zuul_reboot.yaml > /var/log/ansible/zuul_reboot.log 2>&1" + + - name: Rotate Zuul restart logs + include_role: + name: logrotate + vars: + logrotate_file_name: /var/log/ansible/zuul_reboot.log + logrotate_frequency: weekly diff --git a/playbooks/zuul_reboot.yaml b/playbooks/zuul_reboot.yaml index 1cb00b5a58..e0b7fed147 100644 --- a/playbooks/zuul_reboot.yaml +++ b/playbooks/zuul_reboot.yaml @@ -1,4 +1,5 @@ -# TODO We need to add a locking/failsafe check mechanism +# This relies on flock -n /var/run/ansible/zuul_reboot.lock to ensure +# we don't run multiple copies of this playbook concurrently. # TODO: stop pulling in the hourly job if we do this - name: "Ensure we are going to restart/reboot on the same image"