Add periodic cleanup of stale conductors

A new periodic task to automatically remove conductor
records that have been offline for longer than a configured timeout
period. This addresses the issue where deleted or decommissioned
conductors would remain in the database indefinitely.

Closes-Bug: #2069771
Assisted-by: Claude Sonnet 4.0
Change-Id: I90eb159abad94d8369b8792fa17c20d80201569a
Signed-off-by: Afonne-CID <afonnepaulc@gmail.com>
This commit is contained in:
Afonne-CID
2025-08-04 17:33:51 +01:00
parent 90346e5985
commit 6135a00c3b
8 changed files with 370 additions and 0 deletions

View File

@@ -41,6 +41,7 @@ notifying Neutron of a change, etc.
"""
import collections
import datetime
import queue
import time
@@ -62,6 +63,7 @@ from ironic.common import network
from ironic.common import nova
from ironic.common import rpc
from ironic.common import states
from ironic.common import utils as common_utils
from ironic.conductor import allocations
from ironic.conductor import base_manager
from ironic.conductor import cleaning
@@ -3780,6 +3782,116 @@ class ConductorManager(base_manager.BaseConductorManager):
LOG.error('Encountered error while cleaning node '
'history records: %s', e)
@METRICS.timer('ConductorManager.cleanup_stale_conductors')
@periodics.periodic(
spacing=CONF.conductor.conductor_cleanup_interval,
enabled=CONF.conductor.conductor_cleanup_interval > 0
)
def cleanup_stale_conductors(self, context):
"""Periodically clean up stale conductors from the database.
This task removes conductors that have been offline for longer than
the configured timeout period. This helps prevent accumulation of
stale conductor records in the database.
"""
try:
cleanup_timeout = CONF.conductor.conductor_cleanup_timeout
heartbeat_timeout = CONF.conductor.heartbeat_timeout
if heartbeat_timeout <= 0:
LOG.warning(
'Skipping stale conductor cleanup due to invalid '
'configuration: heartbeat_timeout is invalid (%s).',
heartbeat_timeout
)
return
# We require conductor_cleanup_timeout to be at least 3x
# heartbeat_timeout to provide a significant safety margin.
# This ensures that active conductors won't be mistakenly
# removed from the database.
min_required = heartbeat_timeout * 3
if cleanup_timeout < min_required:
error_msg = _(
'Skipping stale conductor cleanup due to invalid '
'configuration: [conductor]conductor_cleanup_timeout '
'(%(cleanup_timeout)s) must be at least 3x '
'[conductor]heartbeat_timeout (%(heartbeat_timeout)s) '
'(recommended minimum: %(min_required)s). This is '
'required to prevent active conductors from being '
'mistakenly removed.') % {
'cleanup_timeout': cleanup_timeout,
'heartbeat_timeout': heartbeat_timeout,
'min_required': min_required}
LOG.warning(error_msg)
return
self._cleanup_stale_conductors(context)
except Exception as e:
LOG.error(
'Encountered error while cleaning up stale conductors: %s', e)
def _cleanup_stale_conductors(self, context):
"""Clean up stale conductors from the database.
:param context: request context.
"""
timeout = CONF.conductor.conductor_cleanup_timeout
batch_size = CONF.conductor.conductor_cleanup_batch_size
# Get conductors that have been offline for longer than the timeout
if not common_utils.is_ironic_using_sqlite():
# For non-SQLite databases, we need to check the updated_at
# timestamp because the database may be shared by multiple
# conductors and we want to avoid deleting records for conductors
# that may still be alive but have not updated their heartbeat
# recently enough.
limit = (timeutils.utcnow() - datetime.timedelta(seconds=timeout))
stale_conductors = self.dbapi.get_offline_conductors()
# Filter by timestamp
conductors_to_delete = []
for hostname in stale_conductors:
try:
conductor = objects.Conductor.get_by_hostname(
context, hostname, online=None)
if conductor.updated_at < limit:
conductors_to_delete.append(hostname)
if len(conductors_to_delete) >= batch_size:
break
except exception.ConductorNotFound:
# Conductor was already deleted, skip
continue
else:
# For SQLite, just get offline conductors
stale_conductors = self.dbapi.get_offline_conductors()
conductors_to_delete = stale_conductors[:batch_size]
if not conductors_to_delete:
return
LOG.info('Cleaning up %(count)d stale conductors: %(conductors)s',
{'count': len(conductors_to_delete),
'conductors': conductors_to_delete})
deleted_count = 0
for hostname in conductors_to_delete:
try:
self.dbapi.delete_conductor(hostname)
deleted_count += 1
except exception.ConductorNotFound:
# Conductor was already deleted by another process
continue
except Exception as e:
LOG.error('Failed to delete conductor %(hostname)s: %(error)s',
{'hostname': hostname, 'error': e})
if deleted_count > 0:
LOG.info('Successfully cleaned up %(count)d stale conductors',
{'count': deleted_count})
def _manage_node_history(self, context):
"""Periodic task to keep the node history tidy."""
max_batch = CONF.conductor.node_history_cleanup_batch_count

View File

@@ -404,6 +404,32 @@ opts = [
'node_history_max_entries setting as users of '
'this setting are anticipated to need to retain '
'history by policy.')),
cfg.IntOpt('conductor_cleanup_interval',
min=0,
default=86400,
mutable=False,
help=_('Interval in seconds at which stale conductor entries '
'can be cleaned up from the database. Setting to 0 '
'disables the periodic task. Defaults to 86400 (1 day).'
)),
cfg.IntOpt('conductor_cleanup_timeout',
min=60,
default=1209600,
mutable=True,
help=_('Timeout in seconds after which offline conductors '
'are considered stale and can be cleaned up from the '
'database. It defaults to two weeks (1209600 seconds) '
'and is always required to be at least 3x larger than '
'[conductor]heartbeat_timeout since if otherwise, '
'active conductors might be mistakenly removed from '
'the database.')),
cfg.IntOpt('conductor_cleanup_batch_size',
min=1,
default=50,
mutable=True,
help=_('The maximum number of stale conductor records to clean '
'up from the database in a single cleanup operation. '
'Defaults to 50.')),
cfg.MultiOpt('verify_step_priority_override',
item_type=types.Dict(),
default={},

View File

@@ -594,6 +594,14 @@ class Connection(object, metaclass=abc.ABCMeta):
:raises: ConductorNotFound
"""
@abc.abstractmethod
def delete_conductor(self, hostname):
"""Delete a conductor from the database.
:param hostname: The hostname of this conductor service.
:raises: ConductorNotFound if the conductor doesn't exist.
"""
@abc.abstractmethod
def touch_conductor(self, hostname, online=True):
"""Mark a conductor as active by updating its 'updated_at' property.

View File

@@ -1457,6 +1457,41 @@ class Connection(api.Connection):
if count == 0:
raise exception.ConductorNotFound(conductor=hostname)
@oslo_db_api.retry_on_deadlock
def delete_conductor(self, hostname):
with _session_for_write() as session:
# Before deleting the conductor, we clear any node
# reservations to prevent nodes from being left in a reserved
# state by a non-existent conductor which could block other
# conductors from managing those nodes.
self.clear_node_reservations_for_conductor(hostname)
# Reset state to prevent transitional or inconsistent states and
# orphaned power state requests after deletion.
self.clear_node_target_power_state(hostname)
# Delete conductor hardware interfaces
query = sa.delete(models.ConductorHardwareInterfaces).where(
models.ConductorHardwareInterfaces.conductor_id == (
session.query(models.Conductor.id).where(
models.Conductor.hostname == hostname
).scalar_subquery()
)
)
session.execute(query)
query = sa.delete(models.Conductor).where(
models.Conductor.hostname == hostname
)
result = session.execute(query)
count = result.rowcount
if count == 0:
raise exception.ConductorNotFound(conductor=hostname)
LOG.info('Deleted conductor with hostname %(hostname)s',
{'hostname': hostname})
@oslo_db_api.retry_on_deadlock
def touch_conductor(self, hostname, online=True):
with _session_for_write() as session:

View File

@@ -158,6 +158,17 @@ class Conductor(base.IronicObject, object_base.VersionedObjectDictCompat):
self.unregister_all_hardware_interfaces()
self.dbapi.unregister_conductor(self.hostname)
@classmethod
def delete(cls, context, hostname):
"""Delete a conductor from the database.
:param cls: the :class:`Conductor`
:param context: Security context
:param hostname: the hostname of the conductor to delete
:raises: ConductorNotFound if the conductor doesn't exist
"""
cls.dbapi.delete_conductor(hostname)
def register_hardware_interfaces(self, interfaces):
"""Register hardware interfaces with the conductor.

View File

@@ -9229,3 +9229,92 @@ class VirtualMediaTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
self.node.refresh()
self.assertIn("Could not attach device cdrom", self.node.last_error)
self.assertIn("disabled or not implemented", self.node.last_error)
class ConductorCleanupTestCase(mgr_utils.ServiceSetUpMixin,
db_base.DbTestCase):
"""Test stale conductors cleanup."""
def test_cleanup_stale_conductors(self):
"""Test the periodic cleanup of stale conductors."""
self._start_service()
self.conductor1 = self.dbapi.register_conductor({
'hostname': 'conductor-1',
'drivers': ['fake'],
'conductor_group': 'group1'
})
self.conductor2 = self.dbapi.register_conductor({
'hostname': 'conductor-2',
'drivers': ['fake'],
'conductor_group': 'group2'
})
self.dbapi.touch_conductor('conductor-1', online=False)
with mock.patch.object(self.dbapi, 'get_offline_conductors',
return_value=['conductor-1'], autospec=True):
self.service._cleanup_stale_conductors(self.context)
self.assertRaises(exception.ConductorNotFound,
self.dbapi.get_conductor, 'conductor-1')
conductor2 = self.dbapi.get_conductor('conductor-2')
self.assertEqual('conductor-2', conductor2.hostname)
def test_cleanup_stale_conductors_disabled(self):
"""Test that cleanup is disabled when configured to do so."""
self._start_service()
conductors = []
for i in range(5):
self.dbapi.register_conductor({
'hostname': 'conductor-%d' % i,
'drivers': ['fake'],
'conductor_group': 'group%d' % i
})
self.dbapi.touch_conductor('conductor-%d' % i, online=False)
conductors.append('conductor-%d' % i)
self.config(conductor_cleanup_batch_size=2, group='conductor')
with mock.patch.object(self.dbapi, 'get_offline_conductors',
return_value=conductors, autospec=True):
with mock.patch.object(self.dbapi, 'delete_conductor',
autospec=True) as mock_delete:
self.service._cleanup_stale_conductors(self.context)
self.assertEqual(2, mock_delete.call_count)
def _run_cleanup_stale_conductors_and_check_warning(
self, heartbeat_timeout, conductor_cleanup_timeout,
expected_warning_substring=None):
self._start_service()
self.config(heartbeat_timeout=heartbeat_timeout, group='conductor')
self.config(conductor_cleanup_timeout=conductor_cleanup_timeout,
group='conductor')
self.config(conductor_cleanup_batch_size=2, group='conductor')
with mock.patch.object(self.dbapi, 'get_offline_conductors',
return_value=[], autospec=True):
with mock.patch('ironic.conductor.manager.LOG',
autospec=True) as mock_log:
self.service.cleanup_stale_conductors(self.context)
if expected_warning_substring is None:
mock_log.warning.assert_not_called()
else:
mock_log.warning.assert_called_once()
self.assertIn(expected_warning_substring,
mock_log.warning.call_args[0][0])
def test_cleanup_stale_conductors_configs_validation(self):
test_cases = [
(60, 180, None), # valid: exactly 3x
(60, 200, None), # valid: greater than 3x
(0, 180, 'heartbeat_timeout is invalid'),
(60, 120, 'conductor_cleanup_timeout (120) must be at least 3x'),
]
for heartbeat_timeout, cleanup_timeout, expected_warning in test_cases:
self._run_cleanup_stale_conductors_and_check_warning(
heartbeat_timeout=heartbeat_timeout,
conductor_cleanup_timeout=cleanup_timeout,
expected_warning_substring=expected_warning
)

View File

@@ -498,3 +498,60 @@ class DbConductorTestCase(base.DbTestCase):
# default for all unit tests running.
result = self.dbapi.list_hardware_type_interfaces([ht1, ht2])
self.assertEqual(4, len(result))
def test_delete_conductor(self):
self.dbapi.register_conductor(
{'hostname': 'test-conductor',
'drivers': ['fake'],
'conductor_group': 'test-group'})
self.dbapi.get_conductor('test-conductor')
self.dbapi.delete_conductor('test-conductor')
self.assertRaises(exception.ConductorNotFound,
self.dbapi.get_conductor, 'test-conductor')
def test_delete_conductor_not_found(self):
self.assertRaises(exception.ConductorNotFound,
self.dbapi.delete_conductor,
'fake-hostname')
def test_delete_conductor_clears_reservations(self):
self.dbapi.register_conductor(
{'hostname': 'test-conductor',
'drivers': ['fake'],
'conductor_group': 'test-group'})
node = self.dbapi.create_node({
'uuid': 'test-node-uuid',
'driver': 'fake',
'reservation': 'test-conductor'
})
self.assertEqual('test-conductor', node.reservation)
self.dbapi.delete_conductor('test-conductor')
node = self.dbapi.get_node_by_uuid('test-node-uuid')
self.assertIsNone(node.reservation)
def test_delete_conductor_clears_hardware_interfaces(self):
conductor = self.dbapi.register_conductor(
{'hostname': 'test-conductor',
'drivers': ['fake'],
'conductor_group': 'test-group'})
self.dbapi.register_conductor_hardware_interfaces(
conductor['id'],
[{'hardware_type': 'fake',
'interface_type': 'deploy',
'interface_name': 'direct',
'default': True}])
interfaces = self.dbapi.list_conductor_hardware_interfaces(
conductor['id'])
self.assertEqual(1, len(interfaces))
self.dbapi.delete_conductor('test-conductor')
interfaces = self.dbapi.list_conductor_hardware_interfaces(
conductor['id'])
self.assertEqual([], interfaces)

View File

@@ -0,0 +1,32 @@
---
features:
- |
Adds periodic cleanup of stale conductor entries from the database.
A new periodic task automatically removes conductor records that have
been offline for longer than the configured timeout period, helping
prevent accumulation of stale conductor entries.
Three new configuration options have been added to the ``[conductor]``
section:
* ``conductor_cleanup_interval`` - The interval in seconds of how often
to run the cleanup task (default: 86400 seconds - 1 day).
For example: 86400 seconds = 1 day, 604800 seconds = 1 week,
2592000 seconds = 30 days (approx. 1 month).
* ``conductor_cleanup_timeout`` - How long a conductor must be offline
before it's considered stale and eligible for cleanup
(default: 1209600 seconds - 2 weeks). This value is always required to be
at least 3x larger than ``[conductor]heartbeat_timeout`` since if
otherwise, active conductors might be mistakenly removed from the
database. The cleanup task will skip execution and log a warning if
this requirement is not met.
* ``conductor_cleanup_batch_size`` - Maximum number of stale conductors
to clean up in a single operation (default: 50).
fixes:
- |
[`Bug 2069771 <https://bugs.launchpad.net/ironic/+bug/2069771>`_]
Fixes an issue where deleted or decommissioned conductors would remain
in ``openstack baremetal conductor list`` indefinitely with
``Alive = False`` status. The new periodic cleanup task automatically
removes these stale entries after the configured timeout period.