From a65e7e9b59451cc1deb905df935ecb7900eef339 Mon Sep 17 00:00:00 2001 From: Alfredo Moralejo Date: Mon, 17 Mar 2025 19:06:28 +0100 Subject: [PATCH] Query by fqdn_label instead of instance for host metrics Currently we are using `instance` label to query about host metrics to prometheus. This label is assigned to the url of each endpoint being scrapped. While this work fine in one-exporter-per-compute cases as the driver is mapping the fqdn_label value to the `instance` label value, it fails when there are more that one target with the same value for the fqdn label. This is a valid case, to be able to query by fqdn and do not care about what exporter in the host is providing the metric. This patch is changing the queries we use for hosts to be based on the fqdn_label instead of the instance one. To implement it, we are also simplifying the way we check the metric exist for the host by converting prometheus_fqdn_instance_map into a prometheus_fqdn_labels set which stores the list of fqdn found in prometheus. Closes-Bug: #2103451 Change-Id: I3bcc317441b73da5c876e53edd4622370c6d575e --- doc/source/datasources/prometheus.rst | 18 +-- ...with-multiple-target-0e65d20711d1abe2.yaml | 9 ++ .../decision_engine/datasources/prometheus.py | 114 ++++++++-------- .../datasources/test_prometheus_helper.py | 122 ++++++++++++++---- 4 files changed, 173 insertions(+), 90 deletions(-) create mode 100644 releasenotes/notes/bug-2103451-fixes-prometheus-queries-with-multiple-target-0e65d20711d1abe2.yaml diff --git a/doc/source/datasources/prometheus.rst b/doc/source/datasources/prometheus.rst index be134e372..2308167a7 100644 --- a/doc/source/datasources/prometheus.rst +++ b/doc/source/datasources/prometheus.rst @@ -29,19 +29,19 @@ This default can be overridden when a deployer uses a different label to identify the exporter host (for example ``hostname`` or ``host``, or any other label, as long as it identifies the host). -Internally this label is used in creating a ``fqdn_instance_map``, mapping -the fqdn with the Prometheus instance label associated with each exporter. -The keys of the resulting fqdn_instance_map are expected to match the +Internally this label is used in creating ``fqdn_instance_labels``, containing +the list of values assigned to the the label in the Prometheus targets. +The elements of the resulting fqdn_instance_labels are expected to match the ``ComputeNode.hostname`` used in the Watcher decision engine cluster model. -An example ``fqdn_instance_map`` is the following: +An example ``fqdn_instance_labels`` is the following: .. code-block:: - { - 'ena.controlplane.domain': '10.1.2.1:9100', - 'dio.controlplane.domain': '10.1.2.2:9100', - 'tria.controlplane.domain': '10.1.2.3:9100' - } + [ + 'ena.controlplane.domain', + 'dio.controlplane.domain', + 'tria.controlplane.domain', + ] For instance metrics, it is required that Prometheus contains a label with the uuid of the OpenStack instance in each relevant metric. By default, diff --git a/releasenotes/notes/bug-2103451-fixes-prometheus-queries-with-multiple-target-0e65d20711d1abe2.yaml b/releasenotes/notes/bug-2103451-fixes-prometheus-queries-with-multiple-target-0e65d20711d1abe2.yaml new file mode 100644 index 000000000..c96ce6718 --- /dev/null +++ b/releasenotes/notes/bug-2103451-fixes-prometheus-queries-with-multiple-target-0e65d20711d1abe2.yaml @@ -0,0 +1,9 @@ +--- +fixes: + - | + When using prometheus datasource and more that one target has the same value + for the `fqdn_label`, the driver used the wrong instance label to query for host + metrics. The `instance` label is no longer used in the queries but the `fqdn_label` + which identifies all the metrics for a specific compute node. + + .. _Bug 2103451: https://bugs.launchpad.net/watcher/+bug/2103451 diff --git a/watcher/decision_engine/datasources/prometheus.py b/watcher/decision_engine/datasources/prometheus.py index 7cac8922d..9b803b6be 100644 --- a/watcher/decision_engine/datasources/prometheus.py +++ b/watcher/decision_engine/datasources/prometheus.py @@ -51,10 +51,10 @@ class PrometheusHelper(base.DataSourceBase): The prometheus helper uses the PrometheusAPIClient provided by python-observabilityclient. - The prometheus_fqdn_instance_map maps the fqdn of each node to the - Prometheus instance label added to all metrics on that node. When - making queries to Prometheus we use the instance label to specify - the node for which metrics are to be retrieved. + The prometheus_fqdn_labels contains a list the values contained in + the fqdn_label in the Prometheus instance. When making queries to + Prometheus we use the fqdn_label to specify the node for which + metrics are to be retrieved. host, port and fqdn_label come from watcher_client config. The prometheus_fqdn_label allows override of the required label in Prometheus scrape configs that specifies each target's fqdn. @@ -63,8 +63,8 @@ class PrometheusHelper(base.DataSourceBase): self.prometheus_fqdn_label = ( CONF.prometheus_client.fqdn_label ) - self.prometheus_fqdn_instance_map = ( - self._build_prometheus_fqdn_instance_map() + self.prometheus_fqdn_labels = ( + self._build_prometheus_fqdn_labels() ) self.prometheus_host_instance_map = ( self._build_prometheus_host_instance_map() @@ -136,73 +136,71 @@ class PrometheusHelper(base.DataSourceBase): return the_client - def _build_prometheus_fqdn_instance_map(self): - """Build the fqdn<-->instance_label mapping needed for queries + def _build_prometheus_fqdn_labels(self): + """Build the list of fqdn_label values to be used in host queries Watcher knows nodes by their hostname. In Prometheus however the scrape targets (also known as 'instances') are specified by I.P. - (or hostname) and port number. This function creates a mapping between - the fully qualified domain name of each node and the corresponding - instance label used in the scrape config. This relies on a custom - 'fqdn' label added to Prometheus scrape_configs. Operators can use - a different custom label instead by setting the prometheus_fqdn_label + (or hostname) and port number and fqdn is stored in a custom 'fqdn' + label added to Prometheus scrape_configs. Operators can use a + different custom label instead by setting the prometheus_fqdn_label config option under the prometheus_client section of watcher config. - The built prometheus_fqdn_instance_map is used to match watcher - node.hostname if watcher stores fqdn and otherwise the - host_instance_map is used instead. - :return a dict mapping fqdn to instance label. For example: - {'marios-env-again.controlplane.domain': '10.1.2.3:9100'} + The built prometheus_fqdn_labels is created with the full list + of values of the prometheus_fqdn_label label in Prometheus. This will + be used to create a map of hostname<-->fqdn and to identify if a target + exist in prometheus for the compute nodes before sending the query. + :return a set of values of the fqdn label. For example: + {'foo.example.com', 'bar.example.com'} + {'foo', 'bar'} """ prometheus_targets = self.prometheus._get( "targets?state=active")['data']['activeTargets'] # >>> prometheus_targets[0]['labels'] # {'fqdn': 'marios-env-again.controlplane.domain', # 'instance': 'localhost:9100', 'job': 'node'} - fqdn_instance_map = { - fqdn: instance for (fqdn, instance) in ( - (target['labels'].get(self.prometheus_fqdn_label), - target['labels'].get('instance')) - for target in prometheus_targets - if target.get('labels', {}).get(self.prometheus_fqdn_label) - ) - } - if not fqdn_instance_map: + fqdn_instance_labels = set() + for target in prometheus_targets: + if target.get('labels', {}).get(self.prometheus_fqdn_label): + fqdn_instance_labels.add( + target['labels'].get(self.prometheus_fqdn_label)) + + if not fqdn_instance_labels: LOG.error( - "Could not create fqdn instance map from Prometheus " + "Could not create fqdn labels list from Prometheus " "targets config. Prometheus returned the following: %s", prometheus_targets ) - return {} - return fqdn_instance_map + return set() + return fqdn_instance_labels def _build_prometheus_host_instance_map(self): """Build the hostname<-->instance_label mapping needed for queries - The prometheus_fqdn_instance_map has the fully qualified domain name + The prometheus_fqdn_labels has the fully qualified domain name for hosts. This will create a duplicate map containing only the host name part. Depending on the watcher node.hostname either the - fqdn_instance_map or the host_instance_map will be used to resolve - the correct prometheus instance label for queries. In the event the - fqdn_instance_map keys are not valid fqdn (for example it contains + fqdn_instance_labels or the host_instance_map will be used to resolve + the correct prometheus fqdn_label for queries. In the event the + fqdn_instance_labels elements are not valid fqdn (for example it has hostnames, not fqdn) the host_instance_map cannot be created and an empty dictionary is returned with a warning logged. :return a dict mapping hostname to instance label. For example: - {'marios-env-again': 'localhost:9100'} + {'foo': 'foo.example.com', 'bar': 'bar.example.com'} """ - if not self.prometheus_fqdn_instance_map: + if not self.prometheus_fqdn_labels: LOG.error("Cannot build host_instance_map without " - "fqdn_instance_map") + "fqdn_instance_labels") return {} host_instance_map = { - host: instance for (host, instance) in ( - (fqdn.split('.')[0], inst) - for fqdn, inst in self.prometheus_fqdn_instance_map.items() + host: fqdn for (host, fqdn) in ( + (fqdn.split('.')[0], fqdn) + for fqdn in self.prometheus_fqdn_labels if '.' in fqdn ) } if not host_instance_map: LOG.warning("Creating empty host instance map. Are the keys " - "in prometheus_fqdn_instance_map valid fqdn?") + "in prometheus_fqdn_labels valid fqdn?") return {} return host_instance_map @@ -210,23 +208,25 @@ class PrometheusHelper(base.DataSourceBase): """Resolve the prometheus instance label to use in queries Given the watcher node.hostname, resolve the prometheus instance - label for use in queries, first trying the fqdn_instance_map and + label for use in queries, first trying the fqdn_instance_labels and then the host_instance_map (watcher.node_name can be fqdn or hostname). If the name is not resolved after the first attempt, rebuild the fqdn and host instance maps and try again. This allows for new hosts added - after the initialisation of the fqdn_instance_map. + after the initialisation of the fqdn_instance_labels. :param node_name: the watcher node.hostname :return String for the prometheus instance label and None if not found """ def _query_maps(node): - return self.prometheus_fqdn_instance_map.get( - node, self.prometheus_host_instance_map.get(node, None)) + if node in self.prometheus_fqdn_labels: + return node + else: + return self.prometheus_host_instance_map.get(node, None) instance_label = _query_maps(node_name) # refresh the fqdn and host instance maps and retry if not instance_label: - self.prometheus_fqdn_instance_map = ( - self._build_prometheus_fqdn_instance_map() + self.prometheus_fqdn_labels = ( + self._build_prometheus_fqdn_labels() ) self.prometheus_host_instance_map = ( self._build_prometheus_host_instance_map() @@ -307,17 +307,21 @@ class PrometheusHelper(base.DataSourceBase): if meter == 'node_cpu_seconds_total': query_args = ( - "100 - (%s by (instance)(rate(%s" - "{mode='idle',instance='%s'}[%ss])) * 100)" % - (aggregate, meter, instance_label, period) + "100 - (%(agg)s by (instance)(rate(%(meter)s" + "{mode='idle',%(label)s='%(label_value)s'}[%(period)ss])) " + "* 100)" + % {'label': self.prometheus_fqdn_label, + 'label_value': instance_label, 'agg': aggregate, + 'meter': meter, 'period': period} ) elif meter == 'node_memory_MemAvailable_bytes': query_args = ( - "(node_memory_MemTotal_bytes{instance='%s'} " - "- %s_over_time(%s{instance='%s'}[%ss])) " - "/ 1024 / 1024" % - (instance_label, aggregate, meter, - instance_label, period) + "(node_memory_MemTotal_bytes{%(label)s='%(label_value)s'} " + "- %(agg)s_over_time(%(meter)s{%(label)s='%(label_value)s'}" + "[%(period)ss])) / 1024 / 1024" + % {'label': self.prometheus_fqdn_label, + 'label_value': instance_label, 'agg': aggregate, + 'meter': meter, 'period': period} ) elif meter == 'ceilometer_memory_usage': query_args = ( diff --git a/watcher/tests/decision_engine/datasources/test_prometheus_helper.py b/watcher/tests/decision_engine/datasources/test_prometheus_helper.py index c96688e9a..1c4b0d662 100644 --- a/watcher/tests/decision_engine/datasources/test_prometheus_helper.py +++ b/watcher/tests/decision_engine/datasources/test_prometheus_helper.py @@ -147,7 +147,8 @@ class TestPrometheusHelper(base.BaseTestCase): self.assertEqual(expected_cpu_usage, result) mock_prometheus_query.assert_called_once_with( "100 - (avg by (instance)(rate(node_cpu_seconds_total" - "{mode='idle',instance='10.0.1.2:9100'}[300s])) * 100)") + "{mode='idle',fqdn='marios-env.controlplane.domain'}[300s]))" + " * 100)") @mock.patch.object(prometheus_client.PrometheusAPIClient, 'query') @mock.patch.object(prometheus_client.PrometheusAPIClient, '_get') @@ -437,15 +438,48 @@ class TestPrometheusHelper(base.BaseTestCase): 'instance': '10.1.2.3:9100', 'job': 'node', }}, ]}} - expected_fqdn_map = {'foo.controlplane.domain': '10.1.2.1:9100', - 'bar.controlplane.domain': '10.1.2.2:9100', - 'baz.controlplane.domain': '10.1.2.3:9100'} - expected_host_map = {'foo': '10.1.2.1:9100', - 'bar': '10.1.2.2:9100', - 'baz': '10.1.2.3:9100'} + expected_fqdn_list = {'foo.controlplane.domain', + 'bar.controlplane.domain', + 'baz.controlplane.domain'} + expected_host_map = {'foo': 'foo.controlplane.domain', + 'bar': 'bar.controlplane.domain', + 'baz': 'baz.controlplane.domain'} helper = prometheus_helper.PrometheusHelper() - self.assertEqual(helper.prometheus_fqdn_instance_map, - expected_fqdn_map) + self.assertEqual(helper.prometheus_fqdn_labels, + expected_fqdn_list) + self.assertEqual(helper.prometheus_host_instance_map, + expected_host_map) + + @mock.patch.object(prometheus_client.PrometheusAPIClient, '_get') + def test_build_prometheus_fqdn_host_instance_map_dupl_fqdn( + self, mock_prometheus_get): + mock_prometheus_get.return_value = {'data': {'activeTargets': [ + {'labels': { + 'fqdn': 'foo.controlplane.domain', + 'instance': '10.1.2.1:9100', 'job': 'node', + }}, + {'labels': { + 'fqdn': 'foo.controlplane.domain', + 'instance': '10.1.2.1:9229', 'job': 'podman', + }}, + {'labels': { + 'fqdn': 'bar.controlplane.domain', + 'instance': '10.1.2.2:9100', 'job': 'node', + }}, + {'labels': { + 'fqdn': 'baz.controlplane.domain', + 'instance': '10.1.2.3:9100', 'job': 'node', + }}, + ]}} + expected_fqdn_list = {'foo.controlplane.domain', + 'bar.controlplane.domain', + 'baz.controlplane.domain'} + expected_host_map = {'foo': 'foo.controlplane.domain', + 'bar': 'bar.controlplane.domain', + 'baz': 'baz.controlplane.domain'} + helper = prometheus_helper.PrometheusHelper() + self.assertEqual(helper.prometheus_fqdn_labels, + expected_fqdn_list) self.assertEqual(helper.prometheus_host_instance_map, expected_host_map) @@ -460,7 +494,7 @@ class TestPrometheusHelper(base.BaseTestCase): }}, ]}} helper = prometheus_helper.PrometheusHelper() - self.assertEqual({}, helper.prometheus_fqdn_instance_map) + self.assertEqual(set(), helper.prometheus_fqdn_labels) self.assertEqual({}, helper.prometheus_host_instance_map) @mock.patch.object(prometheus_client.PrometheusAPIClient, '_get') @@ -476,12 +510,29 @@ class TestPrometheusHelper(base.BaseTestCase): }}, ]}} helper = prometheus_helper.PrometheusHelper() - expected_fqdn_map = {'ena': '10.1.2.1:9100', - 'dyo': '10.1.2.2:9100'} + expected_fqdn_list = {'ena', 'dyo'} self.assertEqual( - helper.prometheus_fqdn_instance_map, expected_fqdn_map) + helper.prometheus_fqdn_labels, expected_fqdn_list) self.assertEqual({}, helper.prometheus_host_instance_map) + @mock.patch.object(prometheus_client.PrometheusAPIClient, '_get') + def test_using_ips_not_fqdn(self, mock_prometheus_get): + mock_prometheus_get.return_value = {'data': {'activeTargets': [ + {'labels': { + 'ip_label': '10.1.2.1', + 'instance': '10.1.2.1:9100', 'job': 'node', + }}, + {'labels': { + 'ip_label': '10.1.2.2', + 'instance': '10.1.2.2:9100', 'job': 'node', + }}, + ]}} + cfg.CONF.prometheus_client.fqdn_label = 'ip_label' + helper = prometheus_helper.PrometheusHelper() + expected_fqdn_list = {'10.1.2.1', '10.1.2.2'} + self.assertEqual( + helper.prometheus_fqdn_labels, expected_fqdn_list) + @mock.patch.object(prometheus_client.PrometheusAPIClient, '_get') def test_override_prometheus_fqdn_label(self, mock_prometheus_get): mock_prometheus_get.return_value = {'data': {'activeTargets': [ @@ -494,19 +545,19 @@ class TestPrometheusHelper(base.BaseTestCase): 'instance': '10.1.2.2:9100', 'job': 'node', }}, ]}} - expected_fqdn_map = {'foo.controlplane.domain': '10.1.2.1:9100', - 'bar.controlplane.domain': '10.1.2.2:9100'} - expected_host_map = {'foo': '10.1.2.1:9100', - 'bar': '10.1.2.2:9100'} + expected_fqdn_list = {'foo.controlplane.domain', + 'bar.controlplane.domain'} + expected_host_map = {'foo': 'foo.controlplane.domain', + 'bar': 'bar.controlplane.domain'} cfg.CONF.prometheus_client.fqdn_label = 'custom_fqdn_label' helper = prometheus_helper.PrometheusHelper() - self.assertEqual(helper.prometheus_fqdn_instance_map, - expected_fqdn_map) + self.assertEqual(helper.prometheus_fqdn_labels, + expected_fqdn_list) self.assertEqual(helper.prometheus_host_instance_map, expected_host_map) def test_resolve_prometheus_instance_label(self): - expected_instance_label = '10.0.1.2:9100' + expected_instance_label = 'marios-env.controlplane.domain' result = self.helper._resolve_prometheus_instance_label( 'marios-env.controlplane.domain') self.assertEqual(result, expected_instance_label) @@ -525,7 +576,7 @@ class TestPrometheusHelper(base.BaseTestCase): def test_build_prometheus_query_node_cpu_avg_agg(self): expected_query = ( "100 - (avg by (instance)(rate(node_cpu_seconds_total" - "{mode='idle',instance='a_host'}[111s])) * 100)") + "{mode='idle',fqdn='a_host'}[111s])) * 100)") result = self.helper._build_prometheus_query( 'avg', 'node_cpu_seconds_total', 'a_host', '111') self.assertEqual(result, expected_query) @@ -533,15 +584,15 @@ class TestPrometheusHelper(base.BaseTestCase): def test_build_prometheus_query_node_cpu_max_agg(self): expected_query = ( "100 - (max by (instance)(rate(node_cpu_seconds_total" - "{mode='idle',instance='b_host'}[444s])) * 100)") + "{mode='idle',fqdn='b_host'}[444s])) * 100)") result = self.helper._build_prometheus_query( 'max', 'node_cpu_seconds_total', 'b_host', '444') self.assertEqual(result, expected_query) def test_build_prometheus_query_node_memory_avg_agg(self): expected_query = ( - "(node_memory_MemTotal_bytes{instance='c_host'} - avg_over_time" - "(node_memory_MemAvailable_bytes{instance='c_host'}[555s])) " + "(node_memory_MemTotal_bytes{fqdn='c_host'} - avg_over_time" + "(node_memory_MemAvailable_bytes{fqdn='c_host'}[555s])) " "/ 1024 / 1024") result = self.helper._build_prometheus_query( 'avg', 'node_memory_MemAvailable_bytes', 'c_host', '555') @@ -549,8 +600,27 @@ class TestPrometheusHelper(base.BaseTestCase): def test_build_prometheus_query_node_memory_min_agg(self): expected_query = ( - "(node_memory_MemTotal_bytes{instance='d_host'} - min_over_time" - "(node_memory_MemAvailable_bytes{instance='d_host'}[222s])) " + "(node_memory_MemTotal_bytes{fqdn='d_host'} - min_over_time" + "(node_memory_MemAvailable_bytes{fqdn='d_host'}[222s])) " + "/ 1024 / 1024") + result = self.helper._build_prometheus_query( + 'min', 'node_memory_MemAvailable_bytes', 'd_host', '222') + self.assertEqual(result, expected_query) + + def test_build_prometheus_query_node_cpu_avg_agg_custom_label(self): + self.helper.prometheus_fqdn_label = 'custom_fqdn_label' + expected_query = ( + "100 - (avg by (instance)(rate(node_cpu_seconds_total" + "{mode='idle',custom_fqdn_label='a_host'}[111s])) * 100)") + result = self.helper._build_prometheus_query( + 'avg', 'node_cpu_seconds_total', 'a_host', '111') + self.assertEqual(result, expected_query) + + def test_build_prometheus_query_node_memory_min_agg_custom_label(self): + self.helper.prometheus_fqdn_label = 'custom_fqdn' + expected_query = ( + "(node_memory_MemTotal_bytes{custom_fqdn='d_host'} - min_over_time" + "(node_memory_MemAvailable_bytes{custom_fqdn='d_host'}[222s])) " "/ 1024 / 1024") result = self.helper._build_prometheus_query( 'min', 'node_memory_MemAvailable_bytes', 'd_host', '222')