Improve tempest alert rules

Alert if results are not seen for some time (i.e. an expected test run was missed). Keep the failed-test alert firing until a test run succeeds (previously this alert resolved itself after 5 minutes, which was undesirable). Change-Id: I4280cd181e709cef3bba6a114273cb32c12a9f7d
2024-06-11 17:14:33 +09:30
parent 59afdcc479
commit 39cb6917bb
9 changed files with 196 additions and 58 deletions
--- a/charms/tempest-k8s/.gitignore
+++ b/charms/tempest-k8s/.gitignore
@@ -0,0 +1,2 @@
 # alert rules for this charm are dynamically written to disk
 /src/loki_alert_rules/*.rules
--- a/charms/tempest-k8s/src/charm.py
+++ b/charms/tempest-k8s/src/charm.py
@@ -51,6 +51,10 @@ from ops.model import (
 from ops_sunbeam.config_contexts import (
    ConfigContext,
 )
 from utils.alert_rules import (
    ensure_alert_rules_disabled,
    update_alert_rules_files,
 )
 from utils.cleanup import (
    CleanUpError,
    run_extensive_cleanup,
@@ -73,9 +77,12 @@ from utils.types import (
    TempestEnvVariant,
 )
 from utils.validators import (
    Schedule,
    validated_schedule,
 )
 LOKI_RELATION_NAME = "logging"
 logger = logging.getLogger(__name__)
@@ -85,7 +92,11 @@ class TempestConfigurationContext(ConfigContext):
    def context(self) -> dict:
        """Tempest context."""
        return {
-            "schedule": self.charm.get_schedule(),
+            "schedule": (
                self.charm.get_schedule().value
                if self.charm.is_schedule_ready()
                else ""
            ),
        }
@@ -131,31 +142,24 @@ class TempestOperatorCharm(sunbeam_charm.OSBaseOperatorCharmK8S):
            ),
        ]
-    def get_schedule(self) -> str:
+    def get_schedule(self) -> Schedule:
-        """Return the schedule option if valid and should be enabled.
+        """Validate and return the schedule from config."""
        return validated_schedule(self.config["schedule"])
-        If the schedule option is invalid,
+    def is_schedule_ready(self) -> bool:
-        or periodic checks shouldn't currently be enabled
+        """Check if the schedule is valid and periodic tests should be enabled.
-        (eg. observability relations not ready),
+
-        then return an empty schedule string.
+        Return True if the schedule config option is valid,
-        An empty string disables the schedule.
+        and pre-requisites for periodic checks are ready.
        """
-        schedule = validated_schedule(self.config["schedule"])
+        schedule = self.get_schedule()
-        if not schedule.valid:
+        return (
-            return ""
+            schedule.valid
-
+            and schedule.value
-        # if tempest env isn't ready,
+            and self.is_tempest_ready()
        # or if the logging relation isn't joined,
        # or if keystone isn't ready,
        # then we can't start scheduling periodic tests
        if not (
            self.is_tempest_ready()
            and self.loki.ready
            and self.user_id_ops.ready
-        ):
+        )
            return ""
        return schedule.value
    @property
    def config_contexts(self) -> List[ConfigContext]:
@@ -188,7 +192,7 @@ class TempestOperatorCharm(sunbeam_charm.OSBaseOperatorCharmK8S):
        handlers.append(self.user_id_ops)
        self.loki = LoggingRelationHandler(
            self,
-            "logging",
+            LOKI_RELATION_NAME,
            self.configure_charm,
            mandatory="logging" in self.mandatory_relations,
        )
@@ -322,7 +326,7 @@ class TempestOperatorCharm(sunbeam_charm.OSBaseOperatorCharmK8S):
        logger.info("Configuring the tempest environment")
-        schedule = validated_schedule(self.config["schedule"])
+        schedule = self.get_schedule()
        if not schedule.valid:
            raise sunbeam_guard.BlockedExceptionError(
                f"invalid schedule config: {schedule.err}"
@@ -340,6 +344,16 @@ class TempestOperatorCharm(sunbeam_charm.OSBaseOperatorCharmK8S):
                "tempest init failed, see logs for more info"
            )
        # Ensure the alert rules are in sync with charm config.
        if self.is_schedule_ready():
            update_alert_rules_files(schedule)
        else:
            ensure_alert_rules_disabled()
        if self.loki.ready:
            for relation in self.model.relations[LOKI_RELATION_NAME]:
                self.loki.interface._handle_alert_rules(relation)
        self.status.set(ActiveStatus(""))
        logger.info("Finished configuring the tempest environment")
--- a/charms/tempest-k8s/src/handlers.py
+++ b/charms/tempest-k8s/src/handlers.py
@@ -37,6 +37,9 @@ import ops.model
 import ops.pebble
 import ops_sunbeam.container_handlers as sunbeam_chandlers
 import ops_sunbeam.relation_handlers as sunbeam_rhandlers
 from utils.alert_rules import (
    ALERT_RULES_PATH,
 )
 from utils.cleanup import (
    CleanUpError,
    run_extensive_cleanup,
@@ -669,7 +672,7 @@ class LoggingRelationHandler(sunbeam_rhandlers.RelationHandler):
            self.charm,
            recursive=True,
            relation_name=self.relation_name,
-            alert_rules_path="src/loki_alert_rules",
+            alert_rules_path=ALERT_RULES_PATH,
            logs_scheme={
                "tempest": {
                    "log-files": [
--- a/charms/tempest-k8s/src/loki_alert_rules/.gitkeep
+++ b/charms/tempest-k8s/src/loki_alert_rules/.gitkeep
--- a/charms/tempest-k8s/src/loki_alert_rules/tests.rules
+++ b/charms/tempest-k8s/src/loki_alert_rules/tests.rules
@@ -1,10 +0,0 @@
 groups:
 - name: tempest-failed-tests
  rules:
  - alert: FailedTests
    expr: |
      sum_over_time({filename="/var/lib/tempest/workspace/tempest-periodic.log", %%juju_topology%%} |~ "- Failed:" | pattern " - <_>: <number_of_tests>" | unwrap number_of_tests [5m]) > 0
    labels:
      severity: high
    annotations:
      summary: "Failed tests: {{ $value }}!"
--- a/charms/tempest-k8s/src/utils/alert_rules.py
+++ b/charms/tempest-k8s/src/utils/alert_rules.py
@@ -0,0 +1,89 @@
 # Copyright 2024 Canonical Ltd.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Working with the loki logging alert rules."""
 import os
 from math import (
    ceil,
 )
 import yaml
 from utils.validators import (
    Schedule,
 )
 ALERT_RULES_PATH = "src/loki_alert_rules"
 ALERT_RULES_FILE = ALERT_RULES_PATH + "/tests.rules"
 # The default for max_query_length in Loki is now 721h,
 # so that is also the limit in the Loki deployed by COS.
 # ref. https://github.com/grafana/loki/issues/4509
 # We stay slightly below that limit so these range queries are accepted.
 MAX_RANGE_HOURS = 719
 def ensure_alert_rules_disabled():
    """Remove the generated alert rules file, if there is one.

    A file that is already missing is not treated as an error.
    """
    try:
        os.remove(ALERT_RULES_FILE)
    except FileNotFoundError:
        # Nothing to remove; the rules file is already absent.
        return
 def update_alert_rules_files(schedule: Schedule) -> None:
    """Write the Loki alert rules file based on the schedule.

    Two rules are written to ALERT_RULES_FILE:

    - ``FailedTests`` compares the last "- Failed:" count logged within
      the previous MAX_RANGE_HOURS hours against zero.
    - ``AbsentTests`` checks that a "- Failed:" summary line was logged
      within twice the longest interval sampled from the schedule.

    `schedule` is expected to be a valid and ready Schedule.
    """
    # Both rules look at the same summary lines of the periodic test log.
    log_query = (
        '{filename="/var/lib/tempest/workspace/tempest-periodic.log", %%juju_topology%%}'
        ' |~ "- Failed:"'
    )

    # Convert seconds to hours, rounding up,
    # and double the interval to ensure it only alerts when one was definitely missed.
    doubled_interval_hours = ceil(schedule.max_interval / 60 / 60) * 2
    absent_range_hours = min(
        # Never produce a range of 0:
        # rounding up alone does not help when max_interval is 0
        # (the Schedule default), so enforce a floor of one hour.
        max(doubled_interval_hours, 1),
        # Ensure that the log query limit isn't exceeded
        MAX_RANGE_HOURS,
    )

    failed_tests_rule = {
        "alert": "FailedTests",
        "expr": f'last_over_time({log_query} | pattern " - <_>: <number_of_tests>" | unwrap number_of_tests [{MAX_RANGE_HOURS}h]) > 0',
        "labels": {
            "severity": "high",
        },
        "annotations": {
            "summary": "Tempest periodic tests failed.",
        },
    }
    absent_tests_rule = {
        "alert": "AbsentTests",
        "expr": f"absent_over_time({log_query} [{absent_range_hours}h]) == 1",
        "labels": {
            "severity": "high",
        },
        "annotations": {
            "summary": "Tempest periodic tests were not run on schedule.",
        },
    }
    rules = {
        "groups": [
            {
                "name": "tempest-failed-tests",
                "rules": [failed_tests_rule, absent_tests_rule],
            }
        ]
    }

    # Explicit encoding so the output does not depend on the locale.
    with open(ALERT_RULES_FILE, "w", encoding="utf-8") as f:
        yaml.safe_dump(rules, f)
--- a/charms/tempest-k8s/src/utils/validators.py
+++ b/charms/tempest-k8s/src/utils/validators.py
@@ -18,25 +18,37 @@ from dataclasses import (
 from datetime import (
    datetime,
 )
 from functools import (
    lru_cache,
 )
 from croniter import (
    CroniterBadDateError,
    croniter,
 )
-@dataclass
+@dataclass(frozen=True)
 class Schedule:
    """A cron schedule that has validation information.

    Instances are produced by `validated_schedule`.
    """
    # The schedule string that was validated.
    value: str
    # Whether the schedule passed validation.
    valid: bool
    # Explanation of why validation failed; an empty string when valid.
    err: str
    # in validation, these are the maximum and minimum intervals between runs seen
    # (both stay at the default of 0 when no intervals are recorded)
    max_interval: int = 0  # in seconds
    min_interval: int = 0  # in seconds
@lru_cache
 def validated_schedule(schedule: str) -> Schedule:
    """Process and validate a schedule str.
    Return the schedule with validation info.
    Part of validation includes sampling a range of matches
    for the cron schedule.  This can be time consuming,
    so this function is cached to avoid repeating work.
    """
    # Empty schedule is fine; it means it's disabled in this context.
    if not schedule:
@@ -66,18 +78,39 @@ def validated_schedule(schedule: str) -> Schedule:
            )
        return Schedule(value=schedule, valid=False, err=msg)
-    # This is a rather naive method for enforcing this,
+    # This is a heuristic method of checking because cron schedules aren't regular,
    # and it may be possible to craft an expression
    # that results in some consecutive runs within 15 minutes,
    # however this is fine, as there is process locking for tempest,
    # and this is more of a sanity check than a security requirement.
-    t1 = cron.get_next()
+    intervals = []  # in seconds
-    t2 = cron.get_next()
+    try:
-    if t2 - t1 < 15 * 60:  # 15 minutes in seconds
+        last = cron.get_next()
        for _ in range(5):
            next_ = cron.get_next()
            intervals.append(next_ - last)
            last = next_
    except CroniterBadDateError:
        return Schedule(
            value=schedule,
            valid=False,
            err=(
                "Could not calculate a range of values from the schedule; "
                "please check the schedule or try a shorter schedule period."
            ),
        )
    if min(intervals) < 15 * 60:  # 15 minutes in seconds
        return Schedule(
            value=schedule,
            valid=False,
            err="Cannot schedule periodic check to run faster than every 15 minutes.",
        )
-    return Schedule(value=schedule, valid=True, err="")
+    return Schedule(
        value=schedule,
        valid=True,
        err="",
        max_interval=max(intervals),
        min_interval=min(intervals),
    )
--- a/charms/tempest-k8s/tests/unit/test_tempest_charm.py
+++ b/charms/tempest-k8s/tests/unit/test_tempest_charm.py
@@ -111,16 +111,6 @@ class TestTempestOperatorCharm(test_utils.CharmTestCase):
        self.patch_obj(
            utils.cleanup, "_get_exclusion_resources"
        ).return_value = {"projects": set(), "users": set()}
        # We must keep a reference to the patcher object,
        # because in a couple of tests we need to not patch this.
        # self.patch_obj doesn't give us a reference to the patcher.
        self.get_unit_data_patcher = patch.object(
            charm.TempestOperatorCharm,
            "get_unit_data",
            Mock(return_value="true"),
        )
        self.get_unit_data_patcher.start()
        self.addCleanup(self.get_unit_data_patcher.stop)
    def add_identity_ops_relation(self, harness):
        """Add identity resource relation."""
@@ -491,7 +481,7 @@ class TestTempestOperatorCharm(test_utils.CharmTestCase):
        self.harness.update_config({"schedule": "*/21 * * * *"})
-        self.harness.charm.set_tempest_ready.has_calls(
+        self.harness.charm.set_tempest_ready.assert_has_calls(
            [call(False), call(False)]
        )
        self.assertEqual(self.harness.charm.set_tempest_ready.call_count, 2)
@@ -507,9 +497,7 @@ class TestTempestOperatorCharm(test_utils.CharmTestCase):
        self.add_identity_ops_relation(self.harness)
        self.add_grafana_dashboard_relation(self.harness)
-        # We want the real get_unit_data method here,
+        # simulate tempest ready
        # because its logic is being tested.
        self.get_unit_data_patcher.stop()
        self.harness.charm.peers = Mock()
        self.harness.charm.peers.interface.peers_rel.data = MagicMock()
        self.harness.charm.peers.interface.peers_rel.data.__getitem__.return_value = {
@@ -525,9 +513,7 @@ class TestTempestOperatorCharm(test_utils.CharmTestCase):
        self.add_identity_ops_relation(self.harness)
        self.add_grafana_dashboard_relation(self.harness)
-        # We want the real get_unit_data method here,
+        # simulate tempest not ready
        # because its logic is being tested.
        self.get_unit_data_patcher.stop()
        self.harness.charm.peers = Mock()
        self.harness.charm.peers.interface.peers_rel.data = MagicMock()
        self.harness.charm.peers.interface.peers_rel.data.__getitem__.return_value = {
--- a/charms/tempest-k8s/tests/unit/test_validators.py
+++ b/charms/tempest-k8s/tests/unit/test_validators.py
@@ -17,6 +17,9 @@
 """Unit tests for Tempest validator utility functions."""
 import unittest
 from dataclasses import (
    FrozenInstanceError,
 )
 from utils.validators import (
    validated_schedule,
@@ -85,3 +88,21 @@ class TempestCharmValidatorTests(unittest.TestCase):
        self.assertFalse(schedule.valid)
        self.assertIn("not acceptable", schedule.err)
        self.assertEqual(schedule.value, exp)
    def test_expression_too_sparse(self):
        """An expression whose run times cannot be sampled is rejected."""
        # February 30 never occurs, so no range of runs can be calculated.
        sparse_expr = "0 4 30 2 *"
        result = validated_schedule(sparse_expr)

        self.assertFalse(result.valid)
        self.assertIn("not calculate a range", result.err)
        # The original expression is still reported back unchanged.
        self.assertEqual(result.value, sparse_expr)
    def test_schedule_type_is_immutable(self):
        """A validated Schedule must reject attribute assignment."""
        # Immutability matters for two reasons:
        # cached instances are shared between callers,
        # and a validated schedule that is modified afterwards
        # may no longer be valid.
        validated = validated_schedule("5 4 * * *")
        self.assertTrue(validated.valid)
        self.assertRaises(
            FrozenInstanceError, setattr, validated, "valid", False
        )
		`@@ -0,0 +1,2 @@`
							`# alert rules for this charm are dynamically written to disk`
							`/src/loki_alert_rules/*.rules`