From a9858a5b77b7c2b2219950bc457de1d79d3cd7b5 Mon Sep 17 00:00:00 2001
From: "James E. Blair"
Date: Wed, 9 Jul 2025 07:21:54 -0700
Subject: [PATCH] Issue a smart-reconfigure after (re-)creating zuul

Zuul no longer automatically performs a smart-reconfigure on startup
so we need to do that ourselves in case the tenant config has changed.

There's a minor race window after the zuul CR spec changes where the
statefulset of the scheduler has not rolled out. We have to wait for
it to complete before calling smart-reconfigure, or we risk running it
on pods scheduled for deletion.

Also adding a fix from:

https://review.opendev.org/c/zuul/zuul-operator/+/861279

This is needed to get exec in pods to work.

Change-Id: Ib35e85ed7666c2eb322971302f7f0d94a28bfa1f
Co-Authored-By: Jan Gutter
Co-Authored-By: Michal Nasiadka
Co-Authored-By: Michael Kelly
---
Review notes (this section is ignored by "git am"):

- Line breaks and context-line indentation were reconstructed from a
  whitespace-collapsed copy of this patch.  The Python context follows
  from syntax; the test.yaml context indentation is a best guess and
  must be checked against the tree before applying.
- wait_for_statefulset() now additionally requires
  status.observedGeneration to have caught up with metadata.generation
  (otherwise the status of the previous, already finished rollout makes
  a not-yet-started rollout look complete), tolerates a missing
  status / spec.replicas, and returns True/False instead of None.
- The post-image blob id on the zuul.py "index" line predates the
  change above and is therefore stale.

 playbooks/zuul-operator-functional/test.yaml |  2 +-
 zuul_operator/operator.py                    |  5 +++
 zuul_operator/utils.py                       |  2 --
 zuul_operator/zuul.py                        | 40 ++++++++++++++++++++++++++
 4 files changed, 46 insertions(+), 3 deletions(-)

diff --git a/playbooks/zuul-operator-functional/test.yaml b/playbooks/zuul-operator-functional/test.yaml
index d21de87..73bd925 100644
--- a/playbooks/zuul-operator-functional/test.yaml
+++ b/playbooks/zuul-operator-functional/test.yaml
@@ -161,7 +161,7 @@
         var: console_stream

     - name: fail if console stream does not contains expected job output
-      when: "'Job console starting...' not in console_stream.stdout"
+      when: "'Job console starting' not in console_stream.stdout"
       # It seems like wsdump.py doesn't always stay connected for the whole job duration
       # when: "'Demo job is running' not in console_stream.stdout"
       fail:
diff --git a/zuul_operator/operator.py b/zuul_operator/operator.py
index 8fe874e..6b32611 100644
--- a/zuul_operator/operator.py
+++ b/zuul_operator/operator.py
@@ -174,6 +174,11 @@ def update_fn(name, namespace, logger, old, new, memo, **kwargs):
     if spec_changed:
         zuul.create_zuul()

+    if conf_changed:
+        if spec_changed:
+            zuul.wait_for_statefulset('zuul-scheduler')
+        zuul.smart_reconfigure()
+
     memoize_secrets(memo, logger)


diff --git a/zuul_operator/utils.py b/zuul_operator/utils.py
index ee547a5..07622ed 100644
--- a/zuul_operator/utils.py
+++ b/zuul_operator/utils.py
@@ -19,7 +19,6 @@ import string
 import kopf
 import yaml
 import jinja2
-import kubernetes
 from kubernetes.client import Configuration
 from kubernetes.client.api import core_v1_api
 from kubernetes.stream import stream
@@ -82,7 +81,6 @@ def update_secret(api, namespace, name, string_data):


 def pod_exec(namespace, name, command):
-    kubernetes.config.load_kube_config()
     try:
         c = Configuration().get_default_copy()
     except AttributeError:
diff --git a/zuul_operator/zuul.py b/zuul_operator/zuul.py
index b2622c6..3a1ca2f 100644
--- a/zuul_operator/zuul.py
+++ b/zuul_operator/zuul.py
@@ -16,6 +16,7 @@ import kopf
 import copy
 import base64
 import hashlib
+import time

 import jinja2
 import pykube
@@ -410,6 +411,45 @@ class Zuul:
         utils.apply_file(self.api, 'zuul.yaml', namespace=self.namespace, **kw)
         self.create_nodepool()

+    def wait_for_statefulset(self, set_name, tries=6, delay=10):
+        """Wait for the named StatefulSet to finish rolling out.
+
+        Polls up to ``tries`` times, sleeping ``delay`` seconds after
+        each unsuccessful poll.  Returns True once the rollout has
+        completed, or False (after logging an error) if it has not.
+        """
+        self.log.info("Waiting for StatefulSet %s to finish rollout", set_name)
+        for _ in range(tries):
+            scheduler_set = objects.StatefulSet.objects(self.api).filter(
+                namespace=self.namespace,
+                selector={'app.kubernetes.io/instance': self.name,
+                          'app.kubernetes.io/component': set_name,
+                          'app.kubernetes.io/name': 'zuul',
+                          'app.kubernetes.io/part-of': 'zuul'}).get(
+                              name=set_name)
+            obj = scheduler_set.obj
+            # spec.replicas is optional and defaults to 1; status may
+            # be absent on an object the controller has not handled yet.
+            replicas = obj['spec'].get('replicas', 1)
+            status = obj.get('status', {})
+            # Until the controller has observed the current generation,
+            # status still describes the previous rollout and would
+            # make an unstarted rollout look complete.
+            observed = (status.get('observedGeneration', 0) >=
+                        obj['metadata'].get('generation', 0))
+            if (observed and
+                    replicas == status.get('replicas') and
+                    replicas == status.get('currentReplicas') and
+                    replicas == status.get('readyReplicas') and
+                    (status.get('updateRevision') ==
+                     status.get('currentRevision'))):
+                self.log.info("StatefulSet %s completed rollout", set_name)
+                return True
+            time.sleep(delay)
+        self.log.error("StatefulSet did not finish rollout after %d seconds",
+                       tries * delay)
+        return False
+
     def smart_reconfigure(self):
         self.log.info("Smart reconfigure")
         try: