Improve tempest alert rules

Alert if results have not been seen for some time (i.e. an expected test run was missed). Keep the failed-test alert firing until a test run succeeds; previously the alert would resolve itself after 5 minutes, which was undesirable.

Change-Id: I4280cd181e709cef3bba6a114273cb32c12a9f7d
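The two behaviours map onto the two Loki rules generated by the new charms/tempest-k8s/src/utils/alert_rules.py further down: last_over_time() keeps FailedTests firing over a long lookback window until a newer result line reports zero failures, and absent_over_time() raises AbsentTests when no result line appears within roughly twice the scheduled interval. A minimal sketch of the generated expressions (the 2h window is an illustrative value for an hourly schedule; the selector and patterns are taken from the change below):

# Illustrative sketch only: the real expressions are built in
# charms/tempest-k8s/src/utils/alert_rules.py (added below).
LOG_SELECTOR = (
    '{filename="/var/lib/tempest/workspace/tempest-periodic.log",'
    ' %%juju_topology%%}'
)

# FailedTests: unwrap the failure count from the most recent "- Failed:" line
# over a long range (719h), so the alert stays active until a later run logs 0.
FAILED_EXPR = (
    f'last_over_time({LOG_SELECTOR} |~ "- Failed:"'
    ' | pattern " - <_>: <number_of_tests>" | unwrap number_of_tests [719h]) > 0'
)

# AbsentTests: fire when no "- Failed:" result line was logged in the lookback
# window (about double the scheduled interval, e.g. 2h for an hourly schedule).
ABSENT_EXPR = f'absent_over_time({LOG_SELECTOR} |~ "- Failed:" [2h]) == 1'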
charms/tempest-k8s/.gitignore (new file, 2 lines)

@@ -0,0 +1,2 @@
+# alert rules for this charm are dynamically written to disk
+/src/loki_alert_rules/*.rules
@@ -51,6 +51,10 @@ from ops.model import (
 from ops_sunbeam.config_contexts import (
     ConfigContext,
 )
+from utils.alert_rules import (
+    ensure_alert_rules_disabled,
+    update_alert_rules_files,
+)
 from utils.cleanup import (
     CleanUpError,
     run_extensive_cleanup,
@@ -73,9 +77,12 @@ from utils.types import (
     TempestEnvVariant,
 )
 from utils.validators import (
+    Schedule,
     validated_schedule,
 )
 
+LOKI_RELATION_NAME = "logging"
+
 logger = logging.getLogger(__name__)
 
 
@@ -85,7 +92,11 @@ class TempestConfigurationContext(ConfigContext):
     def context(self) -> dict:
         """Tempest context."""
         return {
-            "schedule": self.charm.get_schedule(),
+            "schedule": (
+                self.charm.get_schedule().value
+                if self.charm.is_schedule_ready()
+                else ""
+            ),
         }
 
 
@@ -131,31 +142,24 @@ class TempestOperatorCharm(sunbeam_charm.OSBaseOperatorCharmK8S):
             ),
         ]
 
-    def get_schedule(self) -> str:
-        """Return the schedule option if valid and should be enabled.
-
-        If the schedule option is invalid,
-        or periodic checks shouldn't currently be enabled
-        (eg. observability relations not ready),
-        then return an empty schedule string.
-        An empty string disables the schedule.
-        """
-        schedule = validated_schedule(self.config["schedule"])
-        if not schedule.valid:
-            return ""
-
-        # if tempest env isn't ready,
-        # or if the logging relation isn't joined,
-        # or if keystone isn't ready,
-        # then we can't start scheduling periodic tests
-        if not (
-            self.is_tempest_ready()
+    def get_schedule(self) -> Schedule:
+        """Validate and return the schedule from config."""
+        return validated_schedule(self.config["schedule"])
+
+    def is_schedule_ready(self) -> bool:
+        """Check if the schedule is valid and periodic tests should be enabled.
+
+        Return True if the schedule config option is valid,
+        and pre-requisites for periodic checks are ready.
+        """
+        schedule = self.get_schedule()
+        return (
+            schedule.valid
+            and schedule.value
+            and self.is_tempest_ready()
             and self.loki.ready
             and self.user_id_ops.ready
-        ):
-            return ""
-
-        return schedule.value
+        )
 
     @property
     def config_contexts(self) -> List[ConfigContext]:
@@ -188,7 +192,7 @@ class TempestOperatorCharm(sunbeam_charm.OSBaseOperatorCharmK8S):
             handlers.append(self.user_id_ops)
         self.loki = LoggingRelationHandler(
             self,
-            "logging",
+            LOKI_RELATION_NAME,
             self.configure_charm,
             mandatory="logging" in self.mandatory_relations,
         )
@@ -322,7 +326,7 @@ class TempestOperatorCharm(sunbeam_charm.OSBaseOperatorCharmK8S):
 
         logger.info("Configuring the tempest environment")
 
-        schedule = validated_schedule(self.config["schedule"])
+        schedule = self.get_schedule()
         if not schedule.valid:
             raise sunbeam_guard.BlockedExceptionError(
                 f"invalid schedule config: {schedule.err}"
@@ -340,6 +344,16 @@ class TempestOperatorCharm(sunbeam_charm.OSBaseOperatorCharmK8S):
                 "tempest init failed, see logs for more info"
             )
 
+        # Ensure the alert rules are in sync with charm config.
+        if self.is_schedule_ready():
+            update_alert_rules_files(schedule)
+        else:
+            ensure_alert_rules_disabled()
+
+        if self.loki.ready:
+            for relation in self.model.relations[LOKI_RELATION_NAME]:
+                self.loki.interface._handle_alert_rules(relation)
+
         self.status.set(ActiveStatus(""))
         logger.info("Finished configuring the tempest environment")
 
@@ -37,6 +37,9 @@ import ops.model
 import ops.pebble
 import ops_sunbeam.container_handlers as sunbeam_chandlers
 import ops_sunbeam.relation_handlers as sunbeam_rhandlers
+from utils.alert_rules import (
+    ALERT_RULES_PATH,
+)
 from utils.cleanup import (
     CleanUpError,
     run_extensive_cleanup,
@@ -669,7 +672,7 @@ class LoggingRelationHandler(sunbeam_rhandlers.RelationHandler):
             self.charm,
             recursive=True,
             relation_name=self.relation_name,
-            alert_rules_path="src/loki_alert_rules",
+            alert_rules_path=ALERT_RULES_PATH,
             logs_scheme={
                 "tempest": {
                     "log-files": [
charms/tempest-k8s/src/loki_alert_rules/.gitkeep (new empty file)
@@ -1,10 +0,0 @@
-groups:
-- name: tempest-failed-tests
-  rules:
-    - alert: FailedTests
-      expr: |
-        sum_over_time({filename="/var/lib/tempest/workspace/tempest-periodic.log", %%juju_topology%%} |~ "- Failed:" | pattern " - <_>: <number_of_tests>" | unwrap number_of_tests [5m]) > 0
-      labels:
-        severity: high
-      annotations:
-        summary: "Failed tests: {{ $value }}!"
charms/tempest-k8s/src/utils/alert_rules.py (new file, 89 lines)
@@ -0,0 +1,89 @@
+# Copyright 2024 Canonical Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Working with the loki logging alert rules."""
+import os
+from math import (
+    ceil,
+)
+
+import yaml
+from utils.validators import (
+    Schedule,
+)
+
+ALERT_RULES_PATH = "src/loki_alert_rules"
+ALERT_RULES_FILE = ALERT_RULES_PATH + "/tests.rules"
+
+# The default for max_query_length in Loki is now 721h,
+# and thus the value in Loki deployed by COS.
+# ref. https://github.com/grafana/loki/issues/4509
+# We need a small buffer to make it work in these queries.
+MAX_RANGE_HOURS = 719
+
+
+def ensure_alert_rules_disabled():
+    """Ensure the alert rules files don't exist."""
+    try:
+        os.remove(ALERT_RULES_FILE)
+    except FileNotFoundError:
+        pass
+    return
+
+
+def update_alert_rules_files(schedule: Schedule) -> None:
+    """Update files for alert rules based on the schedule.
+
+    `schedule` is expected to be a valid and ready Schedule.
+    """
+    absent_range_hours = min(
+        # Convert seconds to hours,
+        # round up to avoid a range of 0,
+        # and double the interval to ensure it only alerts when one was definitely missed.
+        ceil(schedule.max_interval / 60 / 60) * 2,
+        # Ensure that the log query limit isn't exceeded
+        MAX_RANGE_HOURS,
+    )
+
+    rules = {
+        "groups": [
+            {
+                "name": "tempest-failed-tests",
+                "rules": [
+                    {
+                        "alert": "FailedTests",
+                        "expr": f'last_over_time({{filename="/var/lib/tempest/workspace/tempest-periodic.log", %%juju_topology%%}} |~ "- Failed:" | pattern " - <_>: <number_of_tests>" | unwrap number_of_tests [{MAX_RANGE_HOURS}h]) > 0',
+                        "labels": {
+                            "severity": "high",
+                        },
+                        "annotations": {
+                            "summary": "Tempest periodic tests failed.",
+                        },
+                    },
+                    {
+                        "alert": "AbsentTests",
+                        "expr": f'absent_over_time({{filename="/var/lib/tempest/workspace/tempest-periodic.log", %%juju_topology%%}} |~ "- Failed:" [{absent_range_hours}h]) == 1',
+                        "labels": {
+                            "severity": "high",
+                        },
+                        "annotations": {
+                            "summary": "Tempest periodic tests were not run on schedule.",
+                        },
+                    },
+                ],
+            }
+        ]
+    }
+
+    with open(ALERT_RULES_FILE, "w") as f:
+        yaml.safe_dump(rules, f)
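For intuition about the AbsentTests window above, a minimal sketch of the same arithmetic (the helper name and the example schedules are illustrative, not part of the change):

from math import ceil

MAX_RANGE_HOURS = 719  # mirrors the cap defined in alert_rules.py above

def absent_range_hours(max_interval_seconds: int) -> int:
    # Same computation as update_alert_rules_files(): convert the largest gap
    # between scheduled runs to hours, round up, double it so a single missed
    # run is unambiguous, and cap it below Loki's query-length limit.
    return min(ceil(max_interval_seconds / 60 / 60) * 2, MAX_RANGE_HOURS)

# An hourly schedule (max gap 3600 s) gives a 2h window, a daily schedule 48h,
# and very sparse schedules are capped at 719h.
print(absent_range_hours(3600), absent_range_hours(86400), absent_range_hours(10**8))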
@@ -18,25 +18,37 @@ from dataclasses import (
 from datetime import (
     datetime,
 )
+from functools import (
+    lru_cache,
+)
+
 from croniter import (
+    CroniterBadDateError,
     croniter,
 )
 
 
-@dataclass
+@dataclass(frozen=True)
 class Schedule:
     """A cron schedule that has validation information."""
 
     value: str
     valid: bool
     err: str
+    # in validation, these are the maximum and minimum intervals between runs seen
+    max_interval: int = 0  # in seconds
+    min_interval: int = 0  # in seconds
 
 
+@lru_cache
 def validated_schedule(schedule: str) -> Schedule:
     """Process and validate a schedule str.
 
     Return the schedule with validation info.
 
+    Part of validation includes sampling a range of matches
+    for the cron schedule. This can be time consuming,
+    so this function is cached to avoid repeating work.
     """
     # Empty schedule is fine; it means it's disabled in this context.
     if not schedule:
@@ -66,18 +78,39 @@ def validated_schedule(schedule: str) -> Schedule:
         )
         return Schedule(value=schedule, valid=False, err=msg)
 
-    # This is a rather naive method for enforcing this,
+    # This is a heuristic method of checking because cron schedules aren't regular,
     # and it may be possible to craft an expression
     # that results in some consecutive runs within 15 minutes,
     # however this is fine, as there is process locking for tempest,
     # and this is more of a sanity check than a security requirement.
-    t1 = cron.get_next()
-    t2 = cron.get_next()
-    if t2 - t1 < 15 * 60:  # 15 minutes in seconds
+    intervals = []  # in seconds
+    try:
+        last = cron.get_next()
+        for _ in range(5):
+            next_ = cron.get_next()
+            intervals.append(next_ - last)
+            last = next_
+    except CroniterBadDateError:
+        return Schedule(
+            value=schedule,
+            valid=False,
+            err=(
+                "Could not calculate a range of values from the schedule; "
+                "please check the schedule or try a shorter schedule period."
+            ),
+        )
+
+    if min(intervals) < 15 * 60:  # 15 minutes in seconds
         return Schedule(
             value=schedule,
             valid=False,
             err="Cannot schedule periodic check to run faster than every 15 minutes.",
         )
 
-    return Schedule(value=schedule, valid=True, err="")
+    return Schedule(
+        value=schedule,
+        valid=True,
+        err="",
+        max_interval=max(intervals),
+        min_interval=min(intervals),
+    )
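As a quick illustration of the interval sampling that validated_schedule() now performs (a minimal sketch using croniter directly; the helper name and example expressions are illustrative, not from the charm):

from croniter import CroniterBadDateError, croniter

def sample_intervals(expr: str, n: int = 5) -> list:
    """Mirror the sampling loop added to validated_schedule()."""
    cron = croniter(expr)
    last = cron.get_next()
    intervals = []
    for _ in range(n):
        next_ = cron.get_next()
        intervals.append(next_ - last)  # gap between consecutive runs, in seconds
        last = next_
    return intervals

# "*/10 * * * *" samples 600-second gaps, so min() would trip the 15-minute
# floor; "0 4 30 2 *" (February 30th) never matches and croniter raises
# CroniterBadDateError, which the charm now reports as an invalid schedule.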
@@ -111,16 +111,6 @@ class TestTempestOperatorCharm(test_utils.CharmTestCase):
         self.patch_obj(
             utils.cleanup, "_get_exclusion_resources"
         ).return_value = {"projects": set(), "users": set()}
-        # We must keep a reference to the patcher object,
-        # because in a couple of tests we need to not patch this.
-        # self.patch_obj doesn't give us a reference to the patcher.
-        self.get_unit_data_patcher = patch.object(
-            charm.TempestOperatorCharm,
-            "get_unit_data",
-            Mock(return_value="true"),
-        )
-        self.get_unit_data_patcher.start()
-        self.addCleanup(self.get_unit_data_patcher.stop)
 
     def add_identity_ops_relation(self, harness):
         """Add identity resource relation."""
@@ -491,7 +481,7 @@ class TestTempestOperatorCharm(test_utils.CharmTestCase):
 
         self.harness.update_config({"schedule": "*/21 * * * *"})
 
-        self.harness.charm.set_tempest_ready.has_calls(
+        self.harness.charm.set_tempest_ready.assert_has_calls(
            [call(False), call(False)]
         )
         self.assertEqual(self.harness.charm.set_tempest_ready.call_count, 2)
@@ -507,9 +497,7 @@ class TestTempestOperatorCharm(test_utils.CharmTestCase):
         self.add_identity_ops_relation(self.harness)
         self.add_grafana_dashboard_relation(self.harness)
 
-        # We want the real get_unit_data method here,
-        # because its logic is being tested.
-        self.get_unit_data_patcher.stop()
+        # simulate tempest ready
         self.harness.charm.peers = Mock()
         self.harness.charm.peers.interface.peers_rel.data = MagicMock()
         self.harness.charm.peers.interface.peers_rel.data.__getitem__.return_value = {
@@ -525,9 +513,7 @@ class TestTempestOperatorCharm(test_utils.CharmTestCase):
         self.add_identity_ops_relation(self.harness)
         self.add_grafana_dashboard_relation(self.harness)
 
-        # We want the real get_unit_data method here,
-        # because its logic is being tested.
-        self.get_unit_data_patcher.stop()
+        # simulate tempest not ready
         self.harness.charm.peers = Mock()
         self.harness.charm.peers.interface.peers_rel.data = MagicMock()
         self.harness.charm.peers.interface.peers_rel.data.__getitem__.return_value = {
@@ -17,6 +17,9 @@
 """Unit tests for Tempest validator utility functions."""
 
 import unittest
+from dataclasses import (
+    FrozenInstanceError,
+)
 
 from utils.validators import (
     validated_schedule,
@@ -85,3 +88,21 @@ class TempestCharmValidatorTests(unittest.TestCase):
         self.assertFalse(schedule.valid)
         self.assertIn("not acceptable", schedule.err)
         self.assertEqual(schedule.value, exp)
+
+    def test_expression_too_sparse(self):
+        """Verify an expression with a very long period is caught."""
+        exp = "0 4 30 2 *"  # on february 30 ;)
+        schedule = validated_schedule(exp)
+        self.assertFalse(schedule.valid)
+        self.assertIn("not calculate a range", schedule.err)
+        self.assertEqual(schedule.value, exp)
+
+    def test_schedule_type_is_immutable(self):
+        """Schedule should be immutable."""
+        # this is both to avoid issues with caching it,
+        # and to ensure a validated schedule is not accidentally modified
+        # (it should not be modified because then it may not be valid any more)
+        schedule = validated_schedule("5 4 * * *")
+        self.assertTrue(schedule.valid)
+        with self.assertRaises(FrozenInstanceError):
+            schedule.valid = False