Improve tempest alert rules

Alert if results not seen for some time
(if an expected test run was missed).

Keep the failed test alert firing until a test run succeeds
(this alert would resolve itself after 5min, which was undesirable).

Change-Id: I4280cd181e709cef3bba6a114273cb32c12a9f7d
This commit is contained in:
Samuel Allan
2024-06-11 17:14:33 +09:30
parent 59afdcc479
commit 39cb6917bb
9 changed files with 196 additions and 58 deletions

2
charms/tempest-k8s/.gitignore vendored Normal file
View File

@@ -0,0 +1,2 @@
# alert rules for this charm are dynamically written to disk
/src/loki_alert_rules/*.rules

View File

@@ -51,6 +51,10 @@ from ops.model import (
from ops_sunbeam.config_contexts import ( from ops_sunbeam.config_contexts import (
ConfigContext, ConfigContext,
) )
from utils.alert_rules import (
ensure_alert_rules_disabled,
update_alert_rules_files,
)
from utils.cleanup import ( from utils.cleanup import (
CleanUpError, CleanUpError,
run_extensive_cleanup, run_extensive_cleanup,
@@ -73,9 +77,12 @@ from utils.types import (
TempestEnvVariant, TempestEnvVariant,
) )
from utils.validators import ( from utils.validators import (
Schedule,
validated_schedule, validated_schedule,
) )
LOKI_RELATION_NAME = "logging"
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -85,7 +92,11 @@ class TempestConfigurationContext(ConfigContext):
def context(self) -> dict: def context(self) -> dict:
"""Tempest context.""" """Tempest context."""
return { return {
"schedule": self.charm.get_schedule(), "schedule": (
self.charm.get_schedule().value
if self.charm.is_schedule_ready()
else ""
),
} }
@@ -131,31 +142,24 @@ class TempestOperatorCharm(sunbeam_charm.OSBaseOperatorCharmK8S):
), ),
] ]
def get_schedule(self) -> str: def get_schedule(self) -> Schedule:
"""Return the schedule option if valid and should be enabled. """Validate and return the schedule from config."""
return validated_schedule(self.config["schedule"])
If the schedule option is invalid, def is_schedule_ready(self) -> bool:
or periodic checks shouldn't currently be enabled """Check if the schedule is valid and periodic tests should be enabled.
(eg. observability relations not ready),
then return an empty schedule string. Return True if the schedule config option is valid,
An empty string disables the schedule. and pre-requisites for periodic checks are ready.
""" """
schedule = validated_schedule(self.config["schedule"]) schedule = self.get_schedule()
if not schedule.valid: return (
return "" schedule.valid
and schedule.value
# if tempest env isn't ready, and self.is_tempest_ready()
# or if the logging relation isn't joined,
# or if keystone isn't ready,
# then we can't start scheduling periodic tests
if not (
self.is_tempest_ready()
and self.loki.ready and self.loki.ready
and self.user_id_ops.ready and self.user_id_ops.ready
): )
return ""
return schedule.value
@property @property
def config_contexts(self) -> List[ConfigContext]: def config_contexts(self) -> List[ConfigContext]:
@@ -188,7 +192,7 @@ class TempestOperatorCharm(sunbeam_charm.OSBaseOperatorCharmK8S):
handlers.append(self.user_id_ops) handlers.append(self.user_id_ops)
self.loki = LoggingRelationHandler( self.loki = LoggingRelationHandler(
self, self,
"logging", LOKI_RELATION_NAME,
self.configure_charm, self.configure_charm,
mandatory="logging" in self.mandatory_relations, mandatory="logging" in self.mandatory_relations,
) )
@@ -322,7 +326,7 @@ class TempestOperatorCharm(sunbeam_charm.OSBaseOperatorCharmK8S):
logger.info("Configuring the tempest environment") logger.info("Configuring the tempest environment")
schedule = validated_schedule(self.config["schedule"]) schedule = self.get_schedule()
if not schedule.valid: if not schedule.valid:
raise sunbeam_guard.BlockedExceptionError( raise sunbeam_guard.BlockedExceptionError(
f"invalid schedule config: {schedule.err}" f"invalid schedule config: {schedule.err}"
@@ -340,6 +344,16 @@ class TempestOperatorCharm(sunbeam_charm.OSBaseOperatorCharmK8S):
"tempest init failed, see logs for more info" "tempest init failed, see logs for more info"
) )
# Ensure the alert rules are in sync with charm config.
if self.is_schedule_ready():
update_alert_rules_files(schedule)
else:
ensure_alert_rules_disabled()
if self.loki.ready:
for relation in self.model.relations[LOKI_RELATION_NAME]:
self.loki.interface._handle_alert_rules(relation)
self.status.set(ActiveStatus("")) self.status.set(ActiveStatus(""))
logger.info("Finished configuring the tempest environment") logger.info("Finished configuring the tempest environment")

View File

@@ -37,6 +37,9 @@ import ops.model
import ops.pebble import ops.pebble
import ops_sunbeam.container_handlers as sunbeam_chandlers import ops_sunbeam.container_handlers as sunbeam_chandlers
import ops_sunbeam.relation_handlers as sunbeam_rhandlers import ops_sunbeam.relation_handlers as sunbeam_rhandlers
from utils.alert_rules import (
ALERT_RULES_PATH,
)
from utils.cleanup import ( from utils.cleanup import (
CleanUpError, CleanUpError,
run_extensive_cleanup, run_extensive_cleanup,
@@ -669,7 +672,7 @@ class LoggingRelationHandler(sunbeam_rhandlers.RelationHandler):
self.charm, self.charm,
recursive=True, recursive=True,
relation_name=self.relation_name, relation_name=self.relation_name,
alert_rules_path="src/loki_alert_rules", alert_rules_path=ALERT_RULES_PATH,
logs_scheme={ logs_scheme={
"tempest": { "tempest": {
"log-files": [ "log-files": [

View File

@@ -1,10 +0,0 @@
groups:
- name: tempest-failed-tests
rules:
- alert: FailedTests
expr: |
sum_over_time({filename="/var/lib/tempest/workspace/tempest-periodic.log", %%juju_topology%%} |~ "- Failed:" | pattern " - <_>: <number_of_tests>" | unwrap number_of_tests [5m]) > 0
labels:
severity: high
annotations:
summary: "Failed tests: {{ $value }}!"

View File

@@ -0,0 +1,89 @@
# Copyright 2024 Canonical Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Working with the loki logging alert rules."""
import os
from math import (
ceil,
)
import yaml
from utils.validators import (
Schedule,
)
ALERT_RULES_PATH = "src/loki_alert_rules"
ALERT_RULES_FILE = ALERT_RULES_PATH + "/tests.rules"
# The default for max_query_length in Loki is now 721h,
# and thus the value in Loki deployed by COS.
# ref. https://github.com/grafana/loki/issues/4509
# We need a small buffer to make it work in these queries.
MAX_RANGE_HOURS = 719
def ensure_alert_rules_disabled():
"""Ensure the alert rules files don't exist."""
try:
os.remove(ALERT_RULES_FILE)
except FileNotFoundError:
pass
return
def update_alert_rules_files(schedule: Schedule) -> None:
"""Update files for alert rules based on the schedule.
`schedule` is expected to be a valid and ready Schedule.
"""
absent_range_hours = min(
# Convert seconds to hours,
# round up to avoid a range of 0,
# and double the interval to ensure it only alerts when one was definitely missed.
ceil(schedule.max_interval / 60 / 60) * 2,
# Ensure that the log query limit isn't exceeded
MAX_RANGE_HOURS,
)
rules = {
"groups": [
{
"name": "tempest-failed-tests",
"rules": [
{
"alert": "FailedTests",
"expr": f'last_over_time({{filename="/var/lib/tempest/workspace/tempest-periodic.log", %%juju_topology%%}} |~ "- Failed:" | pattern " - <_>: <number_of_tests>" | unwrap number_of_tests [{MAX_RANGE_HOURS}h]) > 0',
"labels": {
"severity": "high",
},
"annotations": {
"summary": "Tempest periodic tests failed.",
},
},
{
"alert": "AbsentTests",
"expr": f'absent_over_time({{filename="/var/lib/tempest/workspace/tempest-periodic.log", %%juju_topology%%}} |~ "- Failed:" [{absent_range_hours}h]) == 1',
"labels": {
"severity": "high",
},
"annotations": {
"summary": "Tempest periodic tests were not run on schedule.",
},
},
],
}
]
}
with open(ALERT_RULES_FILE, "w") as f:
yaml.safe_dump(rules, f)

View File

@@ -18,25 +18,37 @@ from dataclasses import (
from datetime import ( from datetime import (
datetime, datetime,
) )
from functools import (
lru_cache,
)
from croniter import ( from croniter import (
CroniterBadDateError,
croniter, croniter,
) )
@dataclass @dataclass(frozen=True)
class Schedule: class Schedule:
"""A cron schedule that has validation information.""" """A cron schedule that has validation information."""
value: str value: str
valid: bool valid: bool
err: str err: str
# in validation, these are the maximum and minimum intervals between runs seen
max_interval: int = 0 # in seconds
min_interval: int = 0 # in seconds
@lru_cache
def validated_schedule(schedule: str) -> Schedule: def validated_schedule(schedule: str) -> Schedule:
"""Process and validate a schedule str. """Process and validate a schedule str.
Return the schedule with validation info. Return the schedule with validation info.
Part of validation includes sampling a range of matches
for the cron schedule. This can be time consuming,
so this function is cached to avoid repeating work.
""" """
# Empty schedule is fine; it means it's disabled in this context. # Empty schedule is fine; it means it's disabled in this context.
if not schedule: if not schedule:
@@ -66,18 +78,39 @@ def validated_schedule(schedule: str) -> Schedule:
) )
return Schedule(value=schedule, valid=False, err=msg) return Schedule(value=schedule, valid=False, err=msg)
# This is a rather naive method for enforcing this, # This is a heuristic method of checking because cron schedules aren't regular,
# and it may be possible to craft an expression # and it may be possible to craft an expression
# that results in some consecutive runs within 15 minutes, # that results in some consecutive runs within 15 minutes,
# however this is fine, as there is process locking for tempest, # however this is fine, as there is process locking for tempest,
# and this is more of a sanity check than a security requirement. # and this is more of a sanity check than a security requirement.
t1 = cron.get_next() intervals = [] # in seconds
t2 = cron.get_next() try:
if t2 - t1 < 15 * 60: # 15 minutes in seconds last = cron.get_next()
for _ in range(5):
next_ = cron.get_next()
intervals.append(next_ - last)
last = next_
except CroniterBadDateError:
return Schedule(
value=schedule,
valid=False,
err=(
"Could not calculate a range of values from the schedule; "
"please check the schedule or try a shorter schedule period."
),
)
if min(intervals) < 15 * 60: # 15 minutes in seconds
return Schedule( return Schedule(
value=schedule, value=schedule,
valid=False, valid=False,
err="Cannot schedule periodic check to run faster than every 15 minutes.", err="Cannot schedule periodic check to run faster than every 15 minutes.",
) )
return Schedule(value=schedule, valid=True, err="") return Schedule(
value=schedule,
valid=True,
err="",
max_interval=max(intervals),
min_interval=min(intervals),
)

View File

@@ -111,16 +111,6 @@ class TestTempestOperatorCharm(test_utils.CharmTestCase):
self.patch_obj( self.patch_obj(
utils.cleanup, "_get_exclusion_resources" utils.cleanup, "_get_exclusion_resources"
).return_value = {"projects": set(), "users": set()} ).return_value = {"projects": set(), "users": set()}
# We must keep a reference to the patcher object,
# because in a couple of tests we need to not patch this.
# self.patch_obj doesn't give us a reference to the patcher.
self.get_unit_data_patcher = patch.object(
charm.TempestOperatorCharm,
"get_unit_data",
Mock(return_value="true"),
)
self.get_unit_data_patcher.start()
self.addCleanup(self.get_unit_data_patcher.stop)
def add_identity_ops_relation(self, harness): def add_identity_ops_relation(self, harness):
"""Add identity resource relation.""" """Add identity resource relation."""
@@ -491,7 +481,7 @@ class TestTempestOperatorCharm(test_utils.CharmTestCase):
self.harness.update_config({"schedule": "*/21 * * * *"}) self.harness.update_config({"schedule": "*/21 * * * *"})
self.harness.charm.set_tempest_ready.has_calls( self.harness.charm.set_tempest_ready.assert_has_calls(
[call(False), call(False)] [call(False), call(False)]
) )
self.assertEqual(self.harness.charm.set_tempest_ready.call_count, 2) self.assertEqual(self.harness.charm.set_tempest_ready.call_count, 2)
@@ -507,9 +497,7 @@ class TestTempestOperatorCharm(test_utils.CharmTestCase):
self.add_identity_ops_relation(self.harness) self.add_identity_ops_relation(self.harness)
self.add_grafana_dashboard_relation(self.harness) self.add_grafana_dashboard_relation(self.harness)
# We want the real get_unit_data method here, # simulate tempest ready
# because its logic is being tested.
self.get_unit_data_patcher.stop()
self.harness.charm.peers = Mock() self.harness.charm.peers = Mock()
self.harness.charm.peers.interface.peers_rel.data = MagicMock() self.harness.charm.peers.interface.peers_rel.data = MagicMock()
self.harness.charm.peers.interface.peers_rel.data.__getitem__.return_value = { self.harness.charm.peers.interface.peers_rel.data.__getitem__.return_value = {
@@ -525,9 +513,7 @@ class TestTempestOperatorCharm(test_utils.CharmTestCase):
self.add_identity_ops_relation(self.harness) self.add_identity_ops_relation(self.harness)
self.add_grafana_dashboard_relation(self.harness) self.add_grafana_dashboard_relation(self.harness)
# We want the real get_unit_data method here, # simulate tempest not ready
# because its logic is being tested.
self.get_unit_data_patcher.stop()
self.harness.charm.peers = Mock() self.harness.charm.peers = Mock()
self.harness.charm.peers.interface.peers_rel.data = MagicMock() self.harness.charm.peers.interface.peers_rel.data = MagicMock()
self.harness.charm.peers.interface.peers_rel.data.__getitem__.return_value = { self.harness.charm.peers.interface.peers_rel.data.__getitem__.return_value = {

View File

@@ -17,6 +17,9 @@
"""Unit tests for Tempest validator utility functions.""" """Unit tests for Tempest validator utility functions."""
import unittest import unittest
from dataclasses import (
FrozenInstanceError,
)
from utils.validators import ( from utils.validators import (
validated_schedule, validated_schedule,
@@ -85,3 +88,21 @@ class TempestCharmValidatorTests(unittest.TestCase):
self.assertFalse(schedule.valid) self.assertFalse(schedule.valid)
self.assertIn("not acceptable", schedule.err) self.assertIn("not acceptable", schedule.err)
self.assertEqual(schedule.value, exp) self.assertEqual(schedule.value, exp)
def test_expression_too_sparse(self):
"""Verify an expression with a very long period is caught."""
exp = "0 4 30 2 *" # on february 30 ;)
schedule = validated_schedule(exp)
self.assertFalse(schedule.valid)
self.assertIn("not calculate a range", schedule.err)
self.assertEqual(schedule.value, exp)
def test_schedule_type_is_immutable(self):
"""Schedule should be immutable."""
# this is both to avoid issues with caching it,
# and to ensure a validated schedule is not accidentally modified
# (it should not be modified because then it may not be valid any more)
schedule = validated_schedule("5 4 * * *")
self.assertTrue(schedule.valid)
with self.assertRaises(FrozenInstanceError):
schedule.valid = False