Issue a smart-reconfigure after (re-)creating zuul

Zuul no longer automatically performs a smart-reconfigure on startup, so we need to do that ourselves in case the tenant config has changed.

There's a minor race window after the zuul CR spec changes where the statefulset of the scheduler has not yet rolled out. We have to wait for the rollout to complete before calling smart-reconfigure, or we risk running it on pods scheduled for deletion.

Also adding a fix from https://review.opendev.org/c/zuul/zuul-operator/+/861279, which is needed to get exec in pods to work.

Change-Id: Ib35e85ed7666c2eb322971302f7f0d94a28bfa1f
Co-Authored-By: Jan Gutter <github@jangutter.com>
Co-Authored-By: Michal Nasiadka <mnasiadka@gmail.com>
Co-Authored-By: Michael Kelly <mkelly@arista.com>
2025-07-09 07:21:54 -07:00
parent 1c627c53c5
commit a9858a5b77
4 changed files with 30 additions and 3 deletions
--- a/playbooks/zuul-operator-functional/test.yaml
+++ b/playbooks/zuul-operator-functional/test.yaml
@@ -161,7 +161,7 @@
        var: console_stream

    - name: fail if console stream does not contains expected job output
-      when: "'Job console starting...' not in console_stream.stdout"
+      when: "'Job console starting' not in console_stream.stdout"
      # It seems like wsdump.py doesn't always stay connected for the whole job duration
      # when: "'Demo job is running' not in console_stream.stdout"
      fail:
--- a/zuul_operator/operator.py
+++ b/zuul_operator/operator.py
@@ -174,6 +174,11 @@ def update_fn(name, namespace, logger, old, new, memo, **kwargs):
    if spec_changed:
        zuul.create_zuul()

+    if conf_changed:
+        if spec_changed:
+            zuul.wait_for_statefulset('zuul-scheduler')
+        zuul.smart_reconfigure()
+
    memoize_secrets(memo, logger)


--- a/zuul_operator/utils.py
+++ b/zuul_operator/utils.py
@@ -19,7 +19,6 @@ import string
 import kopf
 import yaml
 import jinja2
-import kubernetes
 from kubernetes.client import Configuration
 from kubernetes.client.api import core_v1_api
 from kubernetes.stream import stream
@@ -82,7 +81,6 @@ def update_secret(api, namespace, name, string_data):


 def pod_exec(namespace, name, command):
-    kubernetes.config.load_kube_config()
    try:
        c = Configuration().get_default_copy()
    except AttributeError:
--- a/zuul_operator/zuul.py
+++ b/zuul_operator/zuul.py
@@ -16,6 +16,7 @@ import kopf
 import copy
 import base64
 import hashlib
+import time

 import jinja2
 import pykube
@@ -410,6 +411,29 @@ class Zuul:
        utils.apply_file(self.api, 'zuul.yaml', namespace=self.namespace, **kw)
        self.create_nodepool()

+    def wait_for_statefulset(self, set_name, tries=6, delay=10):
+        self.log.info("Waiting for StatefulSet %s to finish rollout", set_name)
+        for _ in range(tries):
+            scheduler_set = objects.StatefulSet.objects(self.api).filter(
+                namespace=self.namespace,
+                selector={'app.kubernetes.io/instance': self.name,
+                          'app.kubernetes.io/component': set_name,
+                          'app.kubernetes.io/name': 'zuul',
+                          'app.kubernetes.io/part-of': 'zuul'}).get(
+                              name=set_name)
+            spec = scheduler_set.obj['spec']
+            status = scheduler_set.obj['status']
+            if (spec['replicas'] == status.get('replicas', None) and
+                spec['replicas'] == status.get('currentReplicas', None) and
+                spec['replicas'] == status.get('readyReplicas', None) and
+                (status.get('updateRevision', None) ==
+                 status.get('currentRevision', None))):
+                self.log.info("StatefulSet %s completed rollout", set_name)
+                return
+            time.sleep(delay)
+        self.log.error("StatefulSet did not finish rollout after %d seconds",
+                       tries * delay)
+
    def smart_reconfigure(self):
        self.log.info("Smart reconfigure")
        try: