Add periodic cleanup of stale conductors
A new periodic task to automatically remove conductor records that have been offline for longer than a configured timeout period. This addresses the issue where deleted or decommissioned conductors would remain in the database indefinitely. Closes-Bug: #2069771 Assisted-by: Claude Sonnet 4.0 Change-Id: I90eb159abad94d8369b8792fa17c20d80201569a Signed-off-by: Afonne-CID <afonnepaulc@gmail.com>
This commit is contained in:
@@ -41,6 +41,7 @@ notifying Neutron of a change, etc.
|
||||
"""
|
||||
|
||||
import collections
|
||||
import datetime
|
||||
import queue
|
||||
import time
|
||||
|
||||
@@ -62,6 +63,7 @@ from ironic.common import network
|
||||
from ironic.common import nova
|
||||
from ironic.common import rpc
|
||||
from ironic.common import states
|
||||
from ironic.common import utils as common_utils
|
||||
from ironic.conductor import allocations
|
||||
from ironic.conductor import base_manager
|
||||
from ironic.conductor import cleaning
|
||||
@@ -3780,6 +3782,116 @@ class ConductorManager(base_manager.BaseConductorManager):
|
||||
LOG.error('Encountered error while cleaning node '
|
||||
'history records: %s', e)
|
||||
|
||||
@METRICS.timer('ConductorManager.cleanup_stale_conductors')
|
||||
@periodics.periodic(
|
||||
spacing=CONF.conductor.conductor_cleanup_interval,
|
||||
enabled=CONF.conductor.conductor_cleanup_interval > 0
|
||||
)
|
||||
def cleanup_stale_conductors(self, context):
|
||||
"""Periodically clean up stale conductors from the database.
|
||||
|
||||
This task removes conductors that have been offline for longer than
|
||||
the configured timeout period. This helps prevent accumulation of
|
||||
stale conductor records in the database.
|
||||
"""
|
||||
try:
|
||||
cleanup_timeout = CONF.conductor.conductor_cleanup_timeout
|
||||
heartbeat_timeout = CONF.conductor.heartbeat_timeout
|
||||
|
||||
if heartbeat_timeout <= 0:
|
||||
LOG.warning(
|
||||
'Skipping stale conductor cleanup due to invalid '
|
||||
'configuration: heartbeat_timeout is invalid (%s).',
|
||||
heartbeat_timeout
|
||||
)
|
||||
return
|
||||
|
||||
# We require conductor_cleanup_timeout to be at least 3x
|
||||
# heartbeat_timeout to provide a significant safety margin.
|
||||
# This ensures that active conductors won't be mistakenly
|
||||
# removed from the database.
|
||||
min_required = heartbeat_timeout * 3
|
||||
|
||||
if cleanup_timeout < min_required:
|
||||
error_msg = _(
|
||||
'Skipping stale conductor cleanup due to invalid '
|
||||
'configuration: [conductor]conductor_cleanup_timeout '
|
||||
'(%(cleanup_timeout)s) must be at least 3x '
|
||||
'[conductor]heartbeat_timeout (%(heartbeat_timeout)s) '
|
||||
'(recommended minimum: %(min_required)s). This is '
|
||||
'required to prevent active conductors from being '
|
||||
'mistakenly removed.') % {
|
||||
'cleanup_timeout': cleanup_timeout,
|
||||
'heartbeat_timeout': heartbeat_timeout,
|
||||
'min_required': min_required}
|
||||
LOG.warning(error_msg)
|
||||
return
|
||||
|
||||
self._cleanup_stale_conductors(context)
|
||||
except Exception as e:
|
||||
LOG.error(
|
||||
'Encountered error while cleaning up stale conductors: %s', e)
|
||||
|
||||
def _cleanup_stale_conductors(self, context):
|
||||
"""Clean up stale conductors from the database.
|
||||
|
||||
:param context: request context.
|
||||
"""
|
||||
timeout = CONF.conductor.conductor_cleanup_timeout
|
||||
batch_size = CONF.conductor.conductor_cleanup_batch_size
|
||||
|
||||
# Get conductors that have been offline for longer than the timeout
|
||||
if not common_utils.is_ironic_using_sqlite():
|
||||
|
||||
# For non-SQLite databases, we need to check the updated_at
|
||||
# timestamp because the database may be shared by multiple
|
||||
# conductors and we want to avoid deleting records for conductors
|
||||
# that may still be alive but have not updated their heartbeat
|
||||
# recently enough.
|
||||
limit = (timeutils.utcnow() - datetime.timedelta(seconds=timeout))
|
||||
stale_conductors = self.dbapi.get_offline_conductors()
|
||||
|
||||
# Filter by timestamp
|
||||
conductors_to_delete = []
|
||||
for hostname in stale_conductors:
|
||||
try:
|
||||
conductor = objects.Conductor.get_by_hostname(
|
||||
context, hostname, online=None)
|
||||
if conductor.updated_at < limit:
|
||||
conductors_to_delete.append(hostname)
|
||||
if len(conductors_to_delete) >= batch_size:
|
||||
break
|
||||
except exception.ConductorNotFound:
|
||||
# Conductor was already deleted, skip
|
||||
continue
|
||||
else:
|
||||
# For SQLite, just get offline conductors
|
||||
stale_conductors = self.dbapi.get_offline_conductors()
|
||||
conductors_to_delete = stale_conductors[:batch_size]
|
||||
|
||||
if not conductors_to_delete:
|
||||
return
|
||||
|
||||
LOG.info('Cleaning up %(count)d stale conductors: %(conductors)s',
|
||||
{'count': len(conductors_to_delete),
|
||||
'conductors': conductors_to_delete})
|
||||
|
||||
deleted_count = 0
|
||||
for hostname in conductors_to_delete:
|
||||
try:
|
||||
self.dbapi.delete_conductor(hostname)
|
||||
deleted_count += 1
|
||||
except exception.ConductorNotFound:
|
||||
# Conductor was already deleted by another process
|
||||
continue
|
||||
except Exception as e:
|
||||
LOG.error('Failed to delete conductor %(hostname)s: %(error)s',
|
||||
{'hostname': hostname, 'error': e})
|
||||
|
||||
if deleted_count > 0:
|
||||
LOG.info('Successfully cleaned up %(count)d stale conductors',
|
||||
{'count': deleted_count})
|
||||
|
||||
def _manage_node_history(self, context):
|
||||
"""Periodic task to keep the node history tidy."""
|
||||
max_batch = CONF.conductor.node_history_cleanup_batch_count
|
||||
|
@@ -404,6 +404,32 @@ opts = [
|
||||
'node_history_max_entries setting as users of '
|
||||
'this setting are anticipated to need to retain '
|
||||
'history by policy.')),
|
||||
cfg.IntOpt('conductor_cleanup_interval',
|
||||
min=0,
|
||||
default=86400,
|
||||
mutable=False,
|
||||
help=_('Interval in seconds at which stale conductor entries '
|
||||
'can be cleaned up from the database. Setting to 0 '
|
||||
'disables the periodic task. Defaults to 86400 (1 day).'
|
||||
)),
|
||||
cfg.IntOpt('conductor_cleanup_timeout',
|
||||
min=60,
|
||||
default=1209600,
|
||||
mutable=True,
|
||||
help=_('Timeout in seconds after which offline conductors '
|
||||
'are considered stale and can be cleaned up from the '
|
||||
'database. It defaults to two weeks (1209600 seconds) '
|
||||
'and is always required to be at least 3x larger than '
|
||||
'[conductor]heartbeat_timeout since if otherwise, '
|
||||
'active conductors might be mistakenly removed from '
|
||||
'the database.')),
|
||||
cfg.IntOpt('conductor_cleanup_batch_size',
|
||||
min=1,
|
||||
default=50,
|
||||
mutable=True,
|
||||
help=_('The maximum number of stale conductor records to clean '
|
||||
'up from the database in a single cleanup operation. '
|
||||
'Defaults to 50.')),
|
||||
cfg.MultiOpt('verify_step_priority_override',
|
||||
item_type=types.Dict(),
|
||||
default={},
|
||||
|
@@ -594,6 +594,14 @@ class Connection(object, metaclass=abc.ABCMeta):
|
||||
:raises: ConductorNotFound
|
||||
"""
|
||||
|
||||
@abc.abstractmethod
|
||||
def delete_conductor(self, hostname):
|
||||
"""Delete a conductor from the database.
|
||||
|
||||
:param hostname: The hostname of this conductor service.
|
||||
:raises: ConductorNotFound if the conductor doesn't exist.
|
||||
"""
|
||||
|
||||
@abc.abstractmethod
|
||||
def touch_conductor(self, hostname, online=True):
|
||||
"""Mark a conductor as active by updating its 'updated_at' property.
|
||||
|
@@ -1457,6 +1457,41 @@ class Connection(api.Connection):
|
||||
if count == 0:
|
||||
raise exception.ConductorNotFound(conductor=hostname)
|
||||
|
||||
@oslo_db_api.retry_on_deadlock
|
||||
def delete_conductor(self, hostname):
|
||||
with _session_for_write() as session:
|
||||
|
||||
# Before deleting the conductor, we clear any node
|
||||
# reservations to prevent nodes from being left in a reserved
|
||||
# state by a non-existent conductor which could block other
|
||||
# conductors from managing those nodes.
|
||||
self.clear_node_reservations_for_conductor(hostname)
|
||||
|
||||
# Reset state to prevent transitional or inconsistent states and
|
||||
# orphaned power state requests after deletion.
|
||||
self.clear_node_target_power_state(hostname)
|
||||
|
||||
# Delete conductor hardware interfaces
|
||||
query = sa.delete(models.ConductorHardwareInterfaces).where(
|
||||
models.ConductorHardwareInterfaces.conductor_id == (
|
||||
session.query(models.Conductor.id).where(
|
||||
models.Conductor.hostname == hostname
|
||||
).scalar_subquery()
|
||||
)
|
||||
)
|
||||
session.execute(query)
|
||||
|
||||
query = sa.delete(models.Conductor).where(
|
||||
models.Conductor.hostname == hostname
|
||||
)
|
||||
result = session.execute(query)
|
||||
count = result.rowcount
|
||||
if count == 0:
|
||||
raise exception.ConductorNotFound(conductor=hostname)
|
||||
|
||||
LOG.info('Deleted conductor with hostname %(hostname)s',
|
||||
{'hostname': hostname})
|
||||
|
||||
@oslo_db_api.retry_on_deadlock
|
||||
def touch_conductor(self, hostname, online=True):
|
||||
with _session_for_write() as session:
|
||||
|
@@ -158,6 +158,17 @@ class Conductor(base.IronicObject, object_base.VersionedObjectDictCompat):
|
||||
self.unregister_all_hardware_interfaces()
|
||||
self.dbapi.unregister_conductor(self.hostname)
|
||||
|
||||
@classmethod
|
||||
def delete(cls, context, hostname):
|
||||
"""Delete a conductor from the database.
|
||||
|
||||
:param cls: the :class:`Conductor`
|
||||
:param context: Security context
|
||||
:param hostname: the hostname of the conductor to delete
|
||||
:raises: ConductorNotFound if the conductor doesn't exist
|
||||
"""
|
||||
cls.dbapi.delete_conductor(hostname)
|
||||
|
||||
def register_hardware_interfaces(self, interfaces):
|
||||
"""Register hardware interfaces with the conductor.
|
||||
|
||||
|
@@ -9229,3 +9229,92 @@ class VirtualMediaTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
|
||||
self.node.refresh()
|
||||
self.assertIn("Could not attach device cdrom", self.node.last_error)
|
||||
self.assertIn("disabled or not implemented", self.node.last_error)
|
||||
|
||||
|
||||
class ConductorCleanupTestCase(mgr_utils.ServiceSetUpMixin,
|
||||
db_base.DbTestCase):
|
||||
"""Test stale conductors cleanup."""
|
||||
def test_cleanup_stale_conductors(self):
|
||||
"""Test the periodic cleanup of stale conductors."""
|
||||
self._start_service()
|
||||
|
||||
self.conductor1 = self.dbapi.register_conductor({
|
||||
'hostname': 'conductor-1',
|
||||
'drivers': ['fake'],
|
||||
'conductor_group': 'group1'
|
||||
})
|
||||
self.conductor2 = self.dbapi.register_conductor({
|
||||
'hostname': 'conductor-2',
|
||||
'drivers': ['fake'],
|
||||
'conductor_group': 'group2'
|
||||
})
|
||||
|
||||
self.dbapi.touch_conductor('conductor-1', online=False)
|
||||
|
||||
with mock.patch.object(self.dbapi, 'get_offline_conductors',
|
||||
return_value=['conductor-1'], autospec=True):
|
||||
self.service._cleanup_stale_conductors(self.context)
|
||||
|
||||
self.assertRaises(exception.ConductorNotFound,
|
||||
self.dbapi.get_conductor, 'conductor-1')
|
||||
|
||||
conductor2 = self.dbapi.get_conductor('conductor-2')
|
||||
self.assertEqual('conductor-2', conductor2.hostname)
|
||||
|
||||
def test_cleanup_stale_conductors_disabled(self):
|
||||
"""Test that cleanup is disabled when configured to do so."""
|
||||
self._start_service()
|
||||
|
||||
conductors = []
|
||||
for i in range(5):
|
||||
self.dbapi.register_conductor({
|
||||
'hostname': 'conductor-%d' % i,
|
||||
'drivers': ['fake'],
|
||||
'conductor_group': 'group%d' % i
|
||||
})
|
||||
self.dbapi.touch_conductor('conductor-%d' % i, online=False)
|
||||
conductors.append('conductor-%d' % i)
|
||||
|
||||
self.config(conductor_cleanup_batch_size=2, group='conductor')
|
||||
|
||||
with mock.patch.object(self.dbapi, 'get_offline_conductors',
|
||||
return_value=conductors, autospec=True):
|
||||
with mock.patch.object(self.dbapi, 'delete_conductor',
|
||||
autospec=True) as mock_delete:
|
||||
self.service._cleanup_stale_conductors(self.context)
|
||||
|
||||
self.assertEqual(2, mock_delete.call_count)
|
||||
|
||||
def _run_cleanup_stale_conductors_and_check_warning(
|
||||
self, heartbeat_timeout, conductor_cleanup_timeout,
|
||||
expected_warning_substring=None):
|
||||
self._start_service()
|
||||
self.config(heartbeat_timeout=heartbeat_timeout, group='conductor')
|
||||
self.config(conductor_cleanup_timeout=conductor_cleanup_timeout,
|
||||
group='conductor')
|
||||
self.config(conductor_cleanup_batch_size=2, group='conductor')
|
||||
with mock.patch.object(self.dbapi, 'get_offline_conductors',
|
||||
return_value=[], autospec=True):
|
||||
with mock.patch('ironic.conductor.manager.LOG',
|
||||
autospec=True) as mock_log:
|
||||
self.service.cleanup_stale_conductors(self.context)
|
||||
if expected_warning_substring is None:
|
||||
mock_log.warning.assert_not_called()
|
||||
else:
|
||||
mock_log.warning.assert_called_once()
|
||||
self.assertIn(expected_warning_substring,
|
||||
mock_log.warning.call_args[0][0])
|
||||
|
||||
def test_cleanup_stale_conductors_configs_validation(self):
|
||||
test_cases = [
|
||||
(60, 180, None), # valid: exactly 3x
|
||||
(60, 200, None), # valid: greater than 3x
|
||||
(0, 180, 'heartbeat_timeout is invalid'),
|
||||
(60, 120, 'conductor_cleanup_timeout (120) must be at least 3x'),
|
||||
]
|
||||
for heartbeat_timeout, cleanup_timeout, expected_warning in test_cases:
|
||||
self._run_cleanup_stale_conductors_and_check_warning(
|
||||
heartbeat_timeout=heartbeat_timeout,
|
||||
conductor_cleanup_timeout=cleanup_timeout,
|
||||
expected_warning_substring=expected_warning
|
||||
)
|
||||
|
@@ -498,3 +498,60 @@ class DbConductorTestCase(base.DbTestCase):
|
||||
# default for all unit tests running.
|
||||
result = self.dbapi.list_hardware_type_interfaces([ht1, ht2])
|
||||
self.assertEqual(4, len(result))
|
||||
|
||||
def test_delete_conductor(self):
|
||||
self.dbapi.register_conductor(
|
||||
{'hostname': 'test-conductor',
|
||||
'drivers': ['fake'],
|
||||
'conductor_group': 'test-group'})
|
||||
|
||||
self.dbapi.get_conductor('test-conductor')
|
||||
self.dbapi.delete_conductor('test-conductor')
|
||||
|
||||
self.assertRaises(exception.ConductorNotFound,
|
||||
self.dbapi.get_conductor, 'test-conductor')
|
||||
|
||||
def test_delete_conductor_not_found(self):
|
||||
self.assertRaises(exception.ConductorNotFound,
|
||||
self.dbapi.delete_conductor,
|
||||
'fake-hostname')
|
||||
|
||||
def test_delete_conductor_clears_reservations(self):
|
||||
self.dbapi.register_conductor(
|
||||
{'hostname': 'test-conductor',
|
||||
'drivers': ['fake'],
|
||||
'conductor_group': 'test-group'})
|
||||
|
||||
node = self.dbapi.create_node({
|
||||
'uuid': 'test-node-uuid',
|
||||
'driver': 'fake',
|
||||
'reservation': 'test-conductor'
|
||||
})
|
||||
|
||||
self.assertEqual('test-conductor', node.reservation)
|
||||
self.dbapi.delete_conductor('test-conductor')
|
||||
|
||||
node = self.dbapi.get_node_by_uuid('test-node-uuid')
|
||||
self.assertIsNone(node.reservation)
|
||||
|
||||
def test_delete_conductor_clears_hardware_interfaces(self):
|
||||
conductor = self.dbapi.register_conductor(
|
||||
{'hostname': 'test-conductor',
|
||||
'drivers': ['fake'],
|
||||
'conductor_group': 'test-group'})
|
||||
|
||||
self.dbapi.register_conductor_hardware_interfaces(
|
||||
conductor['id'],
|
||||
[{'hardware_type': 'fake',
|
||||
'interface_type': 'deploy',
|
||||
'interface_name': 'direct',
|
||||
'default': True}])
|
||||
|
||||
interfaces = self.dbapi.list_conductor_hardware_interfaces(
|
||||
conductor['id'])
|
||||
self.assertEqual(1, len(interfaces))
|
||||
|
||||
self.dbapi.delete_conductor('test-conductor')
|
||||
interfaces = self.dbapi.list_conductor_hardware_interfaces(
|
||||
conductor['id'])
|
||||
self.assertEqual([], interfaces)
|
||||
|
@@ -0,0 +1,32 @@
|
||||
---
|
||||
features:
|
||||
- |
|
||||
Adds periodic cleanup of stale conductor entries from the database.
|
||||
A new periodic task automatically removes conductor records that have
|
||||
been offline for longer than the configured timeout period, helping
|
||||
prevent accumulation of stale conductor entries.
|
||||
|
||||
Three new configuration options have been added to the ``[conductor]``
|
||||
section:
|
||||
|
||||
* ``conductor_cleanup_interval`` - The interval in seconds of how often
|
||||
to run the cleanup task (default: 86400 seconds - 1 day).
|
||||
For example: 86400 seconds = 1 day, 604800 seconds = 1 week,
|
||||
2592000 seconds = 30 days (approx. 1 month).
|
||||
* ``conductor_cleanup_timeout`` - How long a conductor must be offline
|
||||
before it's considered stale and eligible for cleanup
|
||||
(default: 1209600 seconds - 2 weeks). This value is always required to be
|
||||
at least 3x larger than ``[conductor]heartbeat_timeout`` since if
|
||||
otherwise, active conductors might be mistakenly removed from the
|
||||
database. The cleanup task will skip execution and log a warning if
|
||||
this requirement is not met.
|
||||
* ``conductor_cleanup_batch_size`` - Maximum number of stale conductors
|
||||
to clean up in a single operation (default: 50).
|
||||
|
||||
fixes:
|
||||
- |
|
||||
[`Bug 2069771 <https://bugs.launchpad.net/ironic/+bug/2069771>`_]
|
||||
Fixes an issue where deleted or decommissioned conductors would remain
|
||||
in ``openstack baremetal conductor list`` indefinitely with
|
||||
``Alive = False`` status. The new periodic cleanup task automatically
|
||||
removes these stale entries after the configured timeout period.
|
Reference in New Issue
Block a user