Fix removing stale metrics from Prometheus exporter

The Prometheus exporter tries to remove stale metrics every polling
cycle so VMs that have been removed from the node do not leave metrics there.
This works fine when there are multiple VMs in the node, but if the
last VM is removed from the node, its metrics survive in the exporter
forever.

This is due to Ceilometer not running the pollsters when there are no VMs
available to collect metrics from, and to the current code running the
cleanup inside the pollster code.

This fix moves the existing cleanup code so it can run even if the
pollster does not execute due to no VMs available, which removes stale
metrics in every case.

Change-Id: I8394c71a78f9b0004514fbb624ac7436d3c60e61
Signed-off-by: jlarriba <jlarriba@redhat.com>
This commit is contained in:
jlarriba
2025-09-24 13:57:35 +02:00
parent 594d3051c6
commit 6ca5d3ea76
3 changed files with 45 additions and 10 deletions

View File

@@ -312,6 +312,9 @@ class PollingTask:
polling_resources.append(x)
poll_history[pollster.name] = history
if self.manager.conf.polling.enable_prometheus_exporter:
prom_exporter.purge_stale_metrics(pollster.name)
# If no resources, skip for this pollster
if not polling_resources:
p_context = 'new' if history else ''

View File

@@ -28,22 +28,12 @@ def export(prom_iface, prom_port, tls_cert=None, tls_key=None):
def collect_metrics(samples):
metric_cleared = False
for sample in samples:
name = "ceilometer_" + sample['counter_name'].replace('.', '_')
labels = _gen_labels(sample)
metric = CEILOMETER_REGISTRY._names_to_collectors.get(name, None)
# NOTE: Unregister the metric at the first iteration to purge stale
# samples
if not metric_cleared:
if metric:
CEILOMETER_REGISTRY.unregister(metric)
metric = None
metric_cleared = True
if metric is None:
metric = prom.Gauge(name=name, documentation="",
labelnames=labels['keys'],
@@ -51,6 +41,18 @@ def collect_metrics(samples):
metric.labels(*labels['values']).set(sample['counter_volume'])
def purge_stale_metrics(pollster):
    """Unregister the exporter metric derived from a pollster name.

    The metric name is built by prefixing the pollster name with
    ``ceilometer_`` and replacing every dot with an underscore. If
    ``CEILOMETER_REGISTRY`` holds a collector under that name, it is
    unregistered; otherwise the call does nothing.

    :param pollster: pollster name, e.g. ``cpu.util``
    """
    metric_name = "ceilometer_" + pollster.replace('.', '_')
    # The lookup goes through the registry's private name -> collector map.
    metric = CEILOMETER_REGISTRY._names_to_collectors.get(metric_name)
    if metric:
        CEILOMETER_REGISTRY.unregister(metric)
def _gen_labels(sample):
labels = dict(keys=[], values=[])
cNameShards = sample['counter_name'].split(".")

View File

@@ -406,3 +406,33 @@ class TestPromExporter(base.BaseTestCase):
'cirros2', 'server_group123']
label3 = prom_exporter._gen_labels(self.test_image_size[0])
self.assertDictEqual(label3, slabels3)
@mock.patch.object(prom_exporter.CEILOMETER_REGISTRY, 'unregister')
def test_purge_stale_metrics_existing_metric(self, mock_unregister):
    """A collector registered under the derived name is unregistered."""
    mock_metric = mock.MagicMock()
    # Patch the registry's name map rather than reassigning it, so the
    # original map is restored when the block exits and no state leaks
    # into other tests that share the module-level registry.
    with mock.patch.object(
            prom_exporter.CEILOMETER_REGISTRY, '_names_to_collectors',
            {'ceilometer_test_metric': mock_metric}):
        prom_exporter.purge_stale_metrics('test.metric')
    mock_unregister.assert_called_once_with(mock_metric)
@mock.patch.object(prom_exporter.CEILOMETER_REGISTRY, 'unregister')
def test_purge_stale_metrics_no_existing_metric(self, mock_unregister):
    """Nothing is unregistered when no collector matches the name."""
    # Patch the registry's name map rather than reassigning it, so the
    # shared registry is not left with an empty map after this test.
    with mock.patch.object(
            prom_exporter.CEILOMETER_REGISTRY, '_names_to_collectors', {}):
        prom_exporter.purge_stale_metrics('nonexistent.metric')
    mock_unregister.assert_not_called()
@mock.patch.object(prom_exporter.CEILOMETER_REGISTRY, 'unregister')
def test_purge_stale_metrics_name_transformation(self, mock_unregister):
    """Dots in the pollster name map to underscores in the metric name."""
    mock_metric = mock.MagicMock()
    # Patch the registry's name map rather than reassigning it, so the
    # original map is restored when the block exits.
    with mock.patch.object(
            prom_exporter.CEILOMETER_REGISTRY, '_names_to_collectors',
            {'ceilometer_cpu_util': mock_metric}):
        prom_exporter.purge_stale_metrics('cpu.util')
    mock_unregister.assert_called_once_with(mock_metric)