From fc5afdacf26d9d0a340f025853beaab53bf54f2b Mon Sep 17 00:00:00 2001
From: Kyle MacLeod
Date: Mon, 17 Jan 2022 17:10:36 -0500
Subject: [PATCH] Refactor portion of dcmanager into dcmanager-state process

This change breaks the monolithic 'dcmanager' process into a separate
state-handling, multi-process architecture. Subcloud state management
is handled by the new state processes. This is a performance
optimization, increasing the throughput of subcloud state change
operations during dcmanager audits. It also improves the performance
of state update handling from dcorch.

Refactoring: these top-level methods are moved from the dcmanager
SubcloudManager to SubcloudStateManager (along with the associated
RPC/service APIs):
- update_subcloud_endpoint_status
- update_subcloud_availability
- plus internal methods, including the subcloud-level synchronization
  block

New service: DCManagerStateService()
- distributedcloud/dcmanager/state/service.py
- overrides the max_pool_size and max_overflow values from
  /etc/dcmanager/dcmanager.conf

New state manager: SubcloudStateManager()
- distributedcloud/dcmanager/state/subcloud_state_manager.py

New RPC client: SubcloudStateClient()
- added to the existing distributedcloud/dcmanager/rpc/client.py

New RPC topic:
- TOPIC_DC_MANAGER_STATE = "dcmanager-state"

SM service changes:
- distributedcloud/dcmanager/cmd/state.py
- distributedcloud/ocf/dcmanager-state

Other notable changes (from testing):
- Cleanup: unused ThreadGroupManager removed from dcmanager service.py
- generic_sync_manager: add an eventlet yield while processing
  subclouds during initialization
- dcorch: fix exceptions on shutdown due to a race on threadgroup
  shutdown
- dcorch: log service startup with worker config

Test cases:
- update test cases where necessary to use the newly refactored code
- since there is no new logic, no additional tests are required

Test Plan:
PASS:
- Test various audit scenarios in a small virtualized environment
- Test audit scenarios in a lab setting with a large number of
  subclouds:
  - subclouds going offline/online (including flooding)
  - dead office recovery
  - swact
  - system soak
- Validate dcmanager-state service lifecycle and dependencies

Story: 2009725
Task: 44317
Change-Id: I2c9a0f84e8cf638632ca319545e9e93e6f43f263
Signed-off-by: Kyle MacLeod
---
 distributedcloud/centos/distributedcloud.spec | 3 +
 .../centos/files/distcloud-syslog.conf | 3 +
 .../dcmanager/api/controllers/v1/subclouds.py | 25 +-
 distributedcloud/dcmanager/audit/auditor.py | 6 +-
 .../dcmanager/audit/firmware_audit.py | 6 +-
 .../audit/kube_rootca_update_audit.py | 4 +-
 .../dcmanager/audit/kubernetes_audit.py | 6 +-
 .../dcmanager/audit/patch_audit.py | 6 +-
 distributedcloud/dcmanager/audit/service.py | 11 +-
 .../audit/subcloud_audit_worker_manager.py | 15 +-
 distributedcloud/dcmanager/cmd/state.py | 69 +++
 distributedcloud/dcmanager/common/config.py | 5 +
 distributedcloud/dcmanager/common/consts.py | 4 +-
 distributedcloud/dcmanager/manager/service.py | 65 ---
 .../dcmanager/manager/subcloud_manager.py | 442 +--------------
 .../states/firmware/finishing_fw_update.py | 4 +-
 distributedcloud/dcmanager/rpc/client.py | 72 ++-
 distributedcloud/dcmanager/state/README.rst | 12 +
 distributedcloud/dcmanager/state/__init__.py | 0
 distributedcloud/dcmanager/state/service.py | 166 ++++++
 .../dcmanager/state/subcloud_state_manager.py | 522 ++++++++++++++++++
 .../unit/api/v1/controllers/test_subclouds.py | 15 +-
 .../unit/audit/test_firmware_audit_manager.py | 51 +-
 .../unit/audit/test_kube_audit_manager.py | 39 +-
.../unit/audit/test_patch_audit_manager.py | 38 +- .../test_subcloud_audit_worker_manager.py | 36 +- .../tests/unit/manager/test_service.py | 7 - .../unit/manager/test_subcloud_manager.py | 68 ++- .../dcorch/api/proxy/apps/controller.py | 4 +- .../dcorch/api/proxy/apps/patch.py | 4 +- distributedcloud/dcorch/cmd/engine.py | 3 + .../dcorch/engine/generic_sync_manager.py | 1 + distributedcloud/dcorch/engine/service.py | 12 +- distributedcloud/dcorch/engine/sync_thread.py | 4 +- distributedcloud/ocf/dcmanager-state | 327 +++++++++++ distributedcloud/setup.cfg | 1 + 36 files changed, 1368 insertions(+), 688 deletions(-) create mode 100644 distributedcloud/dcmanager/cmd/state.py create mode 100755 distributedcloud/dcmanager/state/README.rst create mode 100644 distributedcloud/dcmanager/state/__init__.py create mode 100644 distributedcloud/dcmanager/state/service.py create mode 100644 distributedcloud/dcmanager/state/subcloud_state_manager.py create mode 100644 distributedcloud/ocf/dcmanager-state diff --git a/distributedcloud/centos/distributedcloud.spec b/distributedcloud/centos/distributedcloud.spec index 724bf02c7..a66da3e50 100644 --- a/distributedcloud/centos/distributedcloud.spec +++ b/distributedcloud/centos/distributedcloud.spec @@ -134,6 +134,7 @@ install -d -m 755 %{buildroot}%{_tmpfilesdir} install -d -m 755 %{buildroot}/var/log/dcmanager install -d -m 755 %{buildroot}/var/cache/dcmanager install -d -m 755 %{buildroot}%{_sysconfdir}/dcmanager/ +# TODO(kmacleod) Remove systemd unit files, they are not used: # install systemd unit files install -p -D -m 644 %{SOURCE1} %{buildroot}%{_unitdir}/dcmanager-api.service install -p -D -m 644 %{SOURCE2} %{buildroot}%{_unitdir}/dcmanager-manager.service @@ -151,6 +152,7 @@ install -p -D -m 644 %{SOURCE16} %{buildroot}%{_sysconfdir}/logrotate.d/distclou install -d -m 755 %{buildroot}/var/log/dcorch install -d -m 755 %{buildroot}/var/cache/dcorch install -d -m 755 %{buildroot}%{_sysconfdir}/dcorch/ +# TODO(kmacleod) Remove systemd unit files, they are not used: # install systemd unit files install -p -D -m 644 %{SOURCE3} %{buildroot}%{_unitdir}/dcorch-api.service install -p -D -m 644 %{SOURCE4} %{buildroot}%{_unitdir}/dcorch-engine.service @@ -208,6 +210,7 @@ install -m 755 -D -p %{SOURCE12} %{buildroot}/%{_bindir}/clean-dcorch %{_bindir}/dcmanager-manager %{_unitdir}/dcmanager-manager.service %{_bindir}/dcmanager-manage +%{_bindir}/dcmanager-state %{_tmpfilesdir}/dcmanager.conf %dir %attr(0755,root,root) %{_localstatedir}/log/dcmanager %dir %attr(0755,root,root) %{_localstatedir}/cache/dcmanager diff --git a/distributedcloud/centos/files/distcloud-syslog.conf b/distributedcloud/centos/files/distcloud-syslog.conf index 400873a64..54ef995d5 100644 --- a/distributedcloud/centos/files/distcloud-syslog.conf +++ b/distributedcloud/centos/files/distcloud-syslog.conf @@ -1,6 +1,7 @@ # Distributed Cloud Log destination destination d_dcmanager { file("/var/log/dcmanager/dcmanager.log" template(t_preformatted)); }; destination d_dcmanager_audit { file("/var/log/dcmanager/audit.log" template(t_preformatted)); }; +destination d_dcmanager_state { file("/var/log/dcmanager/state.log" template(t_preformatted)); }; destination d_dcmanager_orch { file("/var/log/dcmanager/orchestrator.log" template(t_preformatted)); }; destination d_dcorch { file("/var/log/dcorch/dcorch.log" template(t_preformatted)); }; destination d_dcdbsync { file("/var/log/dcdbsync/dcdbsync.log" template(t_preformatted)); }; @@ -9,6 +10,7 @@ destination d_dcdbsync_openstack { 
file("/var/log/dcdbsync/dcdbsync_openstack.lo # Distributed Cloud Log Filters filter f_dcmanagermanager { facility(local4) and program(dcmanager-manager); }; filter f_dcmanageraudit { facility(local4) and program(dcmanager-audit); }; +filter f_dcmanagerstate { facility(local4) and program(dcmanager-state); }; filter f_dcmanagerorchestrator { facility(local4) and program(dcmanager-orchestrator); }; filter f_dcmanagerapi { facility(local4) and program(dcmanager-api); }; @@ -21,6 +23,7 @@ filter f_dcdbsyncopenstackapi { facility(local4) and program(dcdbsync-api); } # Distributed Cloud Log Path log {source(s_src); filter(f_dcmanagermanager); destination(d_dcmanager); }; log {source(s_src); filter(f_dcmanageraudit); destination(d_dcmanager_audit); }; +log {source(s_src); filter(f_dcmanagerstate); destination(d_dcmanager_state); }; log {source(s_src); filter(f_dcmanagerorchestrator); destination(d_dcmanager_orch); }; log {source(s_src); filter(f_dcmanagerapi); destination(d_dcmanager); }; log {source(s_src); filter(f_dcorchengine); destination(d_dcorch); }; diff --git a/distributedcloud/dcmanager/api/controllers/v1/subclouds.py b/distributedcloud/dcmanager/api/controllers/v1/subclouds.py index 959eaed56..54d78c07c 100644 --- a/distributedcloud/dcmanager/api/controllers/v1/subclouds.py +++ b/distributedcloud/dcmanager/api/controllers/v1/subclouds.py @@ -115,7 +115,8 @@ class SubcloudsController(object): def __init__(self): super(SubcloudsController, self).__init__() - self.rpc_client = rpc_client.ManagerClient() + self.dcmanager_rpc_client = rpc_client.ManagerClient() + self.dcmanager_state_rpc_client = rpc_client.SubcloudStateClient() # to do the version compatibility for future purpose def _determine_version_cap(self, target): @@ -988,7 +989,7 @@ class SubcloudsController(object): subcloud = self._add_subcloud_to_database(context, payload) # Ask dcmanager-manager to add the subcloud. # It will do all the real work... - self.rpc_client.add_subcloud(context, payload) + self.dcmanager_rpc_client.add_subcloud(context, payload) return db_api.subcloud_db_model_to_dict(subcloud) except RemoteError as e: pecan.abort(422, e.value) @@ -1077,9 +1078,9 @@ class SubcloudsController(object): data_install = json.dumps(payload[INSTALL_VALUES]) try: - # Inform dcmanager-manager that subcloud has been updated. + # Inform dcmanager that subcloud has been updated. # It will do all the real work... 
- subcloud = self.rpc_client.update_subcloud( + subcloud = self.dcmanager_rpc_client.update_subcloud( context, subcloud_id, management_state=management_state, description=description, location=location, group_id=group_id, data_install=data_install, force=force_flag) @@ -1115,8 +1116,8 @@ class SubcloudsController(object): pecan.abort(400, msg) try: - subcloud = self.rpc_client.reconfigure_subcloud(context, subcloud_id, - payload) + subcloud = self.dcmanager_rpc_client.reconfigure_subcloud( + context, subcloud_id, payload) return subcloud except RemoteError as e: pecan.abort(422, e.value) @@ -1269,7 +1270,7 @@ class SubcloudsController(object): deploy_status=consts.DEPLOY_STATE_PRE_INSTALL, data_install=data_install) - self.rpc_client.reinstall_subcloud( + self.dcmanager_rpc_client.reinstall_subcloud( context, subcloud_id, payload) return db_api.subcloud_db_model_to_dict(subcloud) @@ -1348,8 +1349,9 @@ class SubcloudsController(object): pecan.abort(400, msg) try: - self.rpc_client.restore_subcloud(context, subcloud_id, - payload) + self.dcmanager_rpc_client.restore_subcloud(context, + subcloud_id, + payload) # Return deploy_status as pre-restore subcloud.deploy_status = consts.DEPLOY_STATE_PRE_RESTORE return db_api.subcloud_db_model_to_dict(subcloud) @@ -1391,7 +1393,8 @@ class SubcloudsController(object): try: # Ask dcmanager-manager to delete the subcloud. # It will do all the real work... - return self.rpc_client.delete_subcloud(context, subcloud_id) + return self.dcmanager_rpc_client.delete_subcloud(context, + subcloud_id) except RemoteError as e: pecan.abort(422, e.value) except Exception as e: @@ -1429,7 +1432,7 @@ class SubcloudsController(object): LOG.info('update %s set %s=%s' % (subcloud_name, endpoint, status)) context = restcomm.extract_context_from_environ() - self.rpc_client.update_subcloud_endpoint_status( + self.dcmanager_state_rpc_client.update_subcloud_endpoint_status( context, subcloud_name, endpoint, status) result = {'result': 'OK'} diff --git a/distributedcloud/dcmanager/audit/auditor.py b/distributedcloud/dcmanager/audit/auditor.py index fa7cc518a..28c79d56a 100644 --- a/distributedcloud/dcmanager/audit/auditor.py +++ b/distributedcloud/dcmanager/audit/auditor.py @@ -15,14 +15,14 @@ class Auditor(object): # todo(abailey): determine if add_metaclass is still required six.add_metaclass(abc.ABCMeta) - def __init__(self, context, dcmanager_rpc_client, endpoint_type): + def __init__(self, context, dcmanager_state_rpc_client, endpoint_type): self.context = context - self.dcmanager_rpc_client = dcmanager_rpc_client + self.state_rpc_client = dcmanager_state_rpc_client self.endpoint_type = endpoint_type def _set_subcloud_sync_status(self, sc_name, sc_sync_status): """Update the sync status for endpoint.""" - self.dcmanager_rpc_client.update_subcloud_endpoint_status( + self.state_rpc_client.update_subcloud_endpoint_status( self.context, subcloud_name=sc_name, endpoint_type=self.endpoint_type, diff --git a/distributedcloud/dcmanager/audit/firmware_audit.py b/distributedcloud/dcmanager/audit/firmware_audit.py index 896cd8a28..4ecdae063 100644 --- a/distributedcloud/dcmanager/audit/firmware_audit.py +++ b/distributedcloud/dcmanager/audit/firmware_audit.py @@ -70,15 +70,15 @@ class FirmwareAuditData(object): class FirmwareAudit(object): """Manages tasks related to firmware audits.""" - def __init__(self, context, dcmanager_rpc_client): + def __init__(self, context, dcmanager_state_rpc_client): LOG.debug('FirmwareAudit initialization...') self.context = context - 
self.dcmanager_rpc_client = dcmanager_rpc_client + self.state_rpc_client = dcmanager_state_rpc_client self.audit_count = 0 def _update_subcloud_sync_status(self, sc_name, sc_endpoint_type, sc_status): - self.dcmanager_rpc_client.update_subcloud_endpoint_status( + self.state_rpc_client.update_subcloud_endpoint_status( self.context, subcloud_name=sc_name, endpoint_type=sc_endpoint_type, diff --git a/distributedcloud/dcmanager/audit/kube_rootca_update_audit.py b/distributedcloud/dcmanager/audit/kube_rootca_update_audit.py index d17d566d2..4e9d2a3da 100644 --- a/distributedcloud/dcmanager/audit/kube_rootca_update_audit.py +++ b/distributedcloud/dcmanager/audit/kube_rootca_update_audit.py @@ -27,10 +27,10 @@ MONITORED_ALARM_ENTITIES = ['system.certificate.kubernetes-root-ca', ] class KubeRootcaUpdateAudit(Auditor): """Manages tasks related to kube rootca update audits.""" - def __init__(self, context, dcmanager_rpc_client): + def __init__(self, context, dcmanager_state_rpc_client): super(KubeRootcaUpdateAudit, self).__init__( context, - dcmanager_rpc_client, + dcmanager_state_rpc_client, dcorch_consts.ENDPOINT_TYPE_KUBE_ROOTCA ) self.audit_type = "kube rootca update" diff --git a/distributedcloud/dcmanager/audit/kubernetes_audit.py b/distributedcloud/dcmanager/audit/kubernetes_audit.py index 4e3a69e56..ac747ec82 100644 --- a/distributedcloud/dcmanager/audit/kubernetes_audit.py +++ b/distributedcloud/dcmanager/audit/kubernetes_audit.py @@ -52,15 +52,15 @@ class KubernetesAuditData(object): class KubernetesAudit(object): """Manages tasks related to kubernetes audits.""" - def __init__(self, context, dcmanager_rpc_client): + def __init__(self, context, dcmanager_state_rpc_client): LOG.debug('KubernetesAudit initialization...') self.context = context - self.dcmanager_rpc_client = dcmanager_rpc_client + self.state_rpc_client = dcmanager_state_rpc_client self.audit_count = 0 def _update_subcloud_sync_status(self, sc_name, sc_endpoint_type, sc_status): - self.dcmanager_rpc_client.update_subcloud_endpoint_status( + self.state_rpc_client.update_subcloud_endpoint_status( self.context, subcloud_name=sc_name, endpoint_type=sc_endpoint_type, diff --git a/distributedcloud/dcmanager/audit/patch_audit.py b/distributedcloud/dcmanager/audit/patch_audit.py index ec98bc540..1465dd723 100644 --- a/distributedcloud/dcmanager/audit/patch_audit.py +++ b/distributedcloud/dcmanager/audit/patch_audit.py @@ -58,15 +58,15 @@ class PatchAuditData(object): class PatchAudit(object): """Manages tasks related to patch audits.""" - def __init__(self, context, dcmanager_rpc_client): + def __init__(self, context, dcmanager_state_rpc_client): LOG.debug('PatchAudit initialization...') self.context = context - self.dcmanager_rpc_client = dcmanager_rpc_client + self.state_rpc_client = dcmanager_state_rpc_client self.audit_count = 0 def _update_subcloud_sync_status(self, sc_name, sc_endpoint_type, sc_status): - self.dcmanager_rpc_client.update_subcloud_endpoint_status( + self.state_rpc_client.update_subcloud_endpoint_status( self.context, subcloud_name=sc_name, endpoint_type=sc_endpoint_type, diff --git a/distributedcloud/dcmanager/audit/service.py b/distributedcloud/dcmanager/audit/service.py index 3de83bab0..4c27e6598 100644 --- a/distributedcloud/dcmanager/audit/service.py +++ b/distributedcloud/dcmanager/audit/service.py @@ -66,14 +66,15 @@ class DCManagerAuditService(service.Service): self.subcloud_audit_manager = None def start(self): - self.init_tgm() - self.init_audit_managers() target = 
oslo_messaging.Target(version=self.rpc_api_version, server=self.host, topic=self.topic) self.target = target self._rpc_server = rpc_messaging.get_rpc_server(self.target, self) self._rpc_server.start() + + self.init_tgm() + self.init_audit_managers() super(DCManagerAuditService, self).start() def init_tgm(self): @@ -99,7 +100,8 @@ class DCManagerAuditService(service.Service): def stop(self): self._stop_rpc_server() - self.TG.stop() + if self.TG: + self.TG.stop() # Terminate the engine process LOG.info("All threads were gone, terminating engine") @@ -228,7 +230,8 @@ class DCManagerAuditWorkerService(service.Service): def stop(self): self._stop_rpc_server() - self.TG.stop() + if self.TG: + self.TG.stop() # Terminate the engine process LOG.info("All threads were gone, terminating audit-worker engine") diff --git a/distributedcloud/dcmanager/audit/subcloud_audit_worker_manager.py b/distributedcloud/dcmanager/audit/subcloud_audit_worker_manager.py index 9858596ea..ff2cf21b3 100644 --- a/distributedcloud/dcmanager/audit/subcloud_audit_worker_manager.py +++ b/distributedcloud/dcmanager/audit/subcloud_audit_worker_manager.py @@ -58,6 +58,7 @@ class SubcloudAuditWorkerManager(manager.Manager): service_name="subcloud_audit_worker_manager") self.context = context.get_admin_context() self.dcmanager_rpc_client = dcmanager_rpc_client.ManagerClient() + self.state_rpc_client = dcmanager_rpc_client.SubcloudStateClient() # Keeps track of greenthreads we create to do work. self.thread_group_manager = scheduler.ThreadGroupManager( thread_pool_size=100) @@ -66,15 +67,15 @@ class SubcloudAuditWorkerManager(manager.Manager): self.alarm_aggr = alarm_aggregation.AlarmAggregation(self.context) # todo(abailey): refactor the design pattern for adding new audits self.patch_audit = patch_audit.PatchAudit( - self.context, self.dcmanager_rpc_client) + self.context, self.state_rpc_client) self.firmware_audit = firmware_audit.FirmwareAudit( - self.context, self.dcmanager_rpc_client) + self.context, self.state_rpc_client) self.kubernetes_audit = kubernetes_audit.KubernetesAudit( - self.context, self.dcmanager_rpc_client) + self.context, self.state_rpc_client) self.kube_rootca_update_audit = \ kube_rootca_update_audit.KubeRootcaUpdateAudit( self.context, - self.dcmanager_rpc_client) + self.state_rpc_client) self.pid = os.getpid() def audit_subclouds(self, @@ -184,14 +185,14 @@ class SubcloudAuditWorkerManager(manager.Manager): update_state_only=False, audit_fail_count=None): try: - self.dcmanager_rpc_client.update_subcloud_availability( + self.state_rpc_client.update_subcloud_availability( self.context, subcloud_name, availability_status, update_state_only, audit_fail_count) - LOG.info('Notifying dcmanager, subcloud:%s, availability:%s' % + LOG.info('Notifying dcmanager-state, subcloud:%s, availability:%s' % (subcloud_name, availability_status)) except Exception: - LOG.exception('Problem informing dcmanager of subcloud ' + LOG.exception('Problem informing dcmanager-state of subcloud ' 'availability state change, subcloud: %s' % subcloud_name) diff --git a/distributedcloud/dcmanager/cmd/state.py b/distributedcloud/dcmanager/cmd/state.py new file mode 100644 index 000000000..d2d236d59 --- /dev/null +++ b/distributedcloud/dcmanager/cmd/state.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. 
You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +# +# Copyright (c) 2022 Wind River Systems, Inc. +# +# The right to copy, distribute, modify, or otherwise make use +# of this software may be licensed only pursuant to the terms +# of an applicable Wind River license agreement. +# + +""" +DC Manager State Engine Server. +""" + +import eventlet +eventlet.monkey_patch() + +from oslo_config import cfg +from oslo_i18n import _lazy +from oslo_log import log as logging +from oslo_service import service + +from dcmanager.common import config +from dcmanager.common import messaging +from dcorch.common import messaging as dcorch_messaging + +_lazy.enable_lazy() +config.register_options() +config.register_keystone_options() +LOG = logging.getLogger('dcmanager.state') + + +def main(): + logging.register_options(cfg.CONF) + cfg.CONF(project='dcmanager', prog='dcmanager-state') + logging.setup(cfg.CONF, 'dcmanager-state') + logging.set_defaults() + messaging.setup() + dcorch_messaging.setup() + + from dcmanager.state import service as state + + # Override values from /etc/dcmanager/dcmanager.conf specific + # to dcmanager-state: + cfg.CONF.set_override('max_pool_size', 10, group='database') + cfg.CONF.set_override('max_overflow', 100, group='database') + LOG.info("Starting...") + LOG.debug("Configuration:") + cfg.CONF.log_opt_values(LOG, logging.DEBUG) + + LOG.info("Launching service, host=%s, state_workers=%s ...", + cfg.CONF.host, cfg.CONF.state_workers) + srv = state.DCManagerStateService(cfg.CONF.host) + launcher = service.launch(cfg.CONF, srv, workers=cfg.CONF.state_workers) + launcher.wait() + +if __name__ == '__main__': + main() diff --git a/distributedcloud/dcmanager/common/config.py b/distributedcloud/dcmanager/common/config.py index c4ce424ef..b446f3a25 100644 --- a/distributedcloud/dcmanager/common/config.py +++ b/distributedcloud/dcmanager/common/config.py @@ -147,10 +147,15 @@ scheduler_opts = [ ] common_opts = [ + # TODO(kmacleod): these worker parameters should be added to puppet + # in order that they make it into /etc/dcmanager/dcmanager.conf + # for system engineering purposes cfg.IntOpt('workers', default=1, help='number of workers'), cfg.IntOpt('orch_workers', default=1, help='number of orchestrator workers'), + cfg.IntOpt('state_workers', default=4, + help='number of state workers'), cfg.IntOpt('audit_workers', default=1, help='number of audit workers'), cfg.IntOpt('audit_worker_workers', default=4, diff --git a/distributedcloud/dcmanager/common/consts.py b/distributedcloud/dcmanager/common/consts.py index 723677773..70f8a40a3 100644 --- a/distributedcloud/dcmanager/common/consts.py +++ b/distributedcloud/dcmanager/common/consts.py @@ -16,11 +16,9 @@ RPC_API_VERSION = "1.0" TOPIC_DC_MANAGER = "dcmanager" - +TOPIC_DC_MANAGER_STATE = "dcmanager-state" TOPIC_DC_MANAGER_AUDIT = "dcmanager-audit" - TOPIC_DC_MANAGER_AUDIT_WORKER = "dcmanager-audit-worker" - TOPIC_DC_MANAGER_ORCHESTRATOR = "dcmanager-orchestrator" CERTS_VAULT_DIR = "/opt/dc-vault/certs" diff --git a/distributedcloud/dcmanager/manager/service.py b/distributedcloud/dcmanager/manager/service.py index 8ecf9756d..ab0b1c92a 100644 --- 
a/distributedcloud/dcmanager/manager/service.py +++ b/distributedcloud/dcmanager/manager/service.py @@ -22,15 +22,12 @@ import oslo_messaging from oslo_service import service from oslo_utils import uuidutils -from dcorch.common import consts as dcorch_consts - from dcmanager.audit import rpcapi as dcmanager_audit_rpc_client from dcmanager.common import consts from dcmanager.common import context from dcmanager.common import exceptions from dcmanager.common.i18n import _ from dcmanager.common import messaging as rpc_messaging -from dcmanager.common import scheduler from dcmanager.manager.subcloud_manager import SubcloudManager CONF = cfg.CONF @@ -69,21 +66,16 @@ class DCManagerService(service.Service): # The following are initialized here, but assigned in start() which # happens after the fork when spawning multiple worker processes self.engine_id = None - self.TG = None self.target = None self._rpc_server = None self.subcloud_manager = None self.audit_rpc_client = None - def init_tgm(self): - self.TG = scheduler.ThreadGroupManager() - def init_managers(self): self.subcloud_manager = SubcloudManager() def start(self): self.dcmanager_id = uuidutils.generate_uuid() - self.init_tgm() self.init_managers() target = oslo_messaging.Target(version=self.rpc_api_version, server=self.host, @@ -151,60 +143,6 @@ class DCManagerService(service.Service): subcloud_id, payload) - @request_context - def update_subcloud_endpoint_status(self, context, subcloud_name=None, - endpoint_type=None, - sync_status=consts. - SYNC_STATUS_OUT_OF_SYNC, - alarmable=True): - # Updates subcloud endpoint sync status - LOG.info("Handling update_subcloud_endpoint_status request for " - "subcloud: (%s) endpoint: (%s) status:(%s) " - % (subcloud_name, endpoint_type, sync_status)) - - self.subcloud_manager. \ - update_subcloud_endpoint_status(context, - subcloud_name, - endpoint_type, - sync_status, - alarmable) - - # If the patching sync status is being set to unknown, trigger the - # patching audit so it can update the sync status ASAP. - if endpoint_type == dcorch_consts.ENDPOINT_TYPE_PATCHING and \ - sync_status == consts.SYNC_STATUS_UNKNOWN: - self.audit_rpc_client.trigger_patch_audit(context) - - # If the firmware sync status is being set to unknown, trigger the - # firmware audit so it can update the sync status ASAP. - if endpoint_type == dcorch_consts.ENDPOINT_TYPE_FIRMWARE and \ - sync_status == consts.SYNC_STATUS_UNKNOWN: - self.audit_rpc_client.trigger_firmware_audit(context) - - # If the kubernetes sync status is being set to unknown, trigger the - # kubernetes audit so it can update the sync status ASAP. 
- if endpoint_type == dcorch_consts.ENDPOINT_TYPE_KUBERNETES and \ - sync_status == consts.SYNC_STATUS_UNKNOWN: - self.audit_rpc_client.trigger_kubernetes_audit(context) - - return - - @request_context - def update_subcloud_availability(self, context, - subcloud_name, - availability_status, - update_state_only=False, - audit_fail_count=None): - # Updates subcloud availability - LOG.info("Handling update_subcloud_availability request for: %s" % - subcloud_name) - self.subcloud_manager.update_subcloud_availability( - context, - subcloud_name, - availability_status, - update_state_only, - audit_fail_count) - @request_context def update_subcloud_sync_endpoint_type(self, context, subcloud_name, endpoint_type_list, @@ -228,9 +166,6 @@ class DCManagerService(service.Service): def stop(self): self._stop_rpc_server() - - self.TG.stop() - # Terminate the engine process LOG.info("All threads were gone, terminating engine") super(DCManagerService, self).stop() diff --git a/distributedcloud/dcmanager/manager/subcloud_manager.py b/distributedcloud/dcmanager/manager/subcloud_manager.py index d6ca6f34b..e1124a2f4 100644 --- a/distributedcloud/dcmanager/manager/subcloud_manager.py +++ b/distributedcloud/dcmanager/manager/subcloud_manager.py @@ -48,9 +48,8 @@ from dcmanager.common import exceptions from dcmanager.common.i18n import _ from dcmanager.common import manager from dcmanager.common import utils -from dcmanager.rpc import client as rpc_client - from dcmanager.db import api as db_api +from dcmanager.rpc import client as dcmanager_rpc_client from fm_api import constants as fm_const from fm_api import fm_api @@ -102,23 +101,6 @@ TRANSITORY_STATES = {consts.DEPLOY_STATE_NONE: consts.DEPLOY_STATE_DEPLOY_PREP_F } -def sync_update_subcloud_endpoint_status(func): - """Synchronized lock decorator for _update_subcloud_endpoint_status. """ - - def _get_lock_and_call(*args, **kwargs): - """Get a single fair lock per subcloud based on subcloud name. """ - - # subcloud name is the 3rd argument to - # _update_subcloud_endpoint_status() - @utils.synchronized(args[2], external=False, fair=True) - def _call_func(*args, **kwargs): - return func(*args, **kwargs) - - return _call_func(*args, **kwargs) - - return _get_lock_and_call - - class SubcloudManager(manager.Manager): """Manages tasks related to subclouds.""" @@ -131,6 +113,7 @@ class SubcloudManager(manager.Manager): self.dcorch_rpc_client = dcorch_rpc_client.EngineClient() self.fm_api = fm_api.FaultAPIs() self.audit_rpc_client = dcmanager_audit_rpc_client.ManagerAuditClient() + self.state_rpc_client = dcmanager_rpc_client.SubcloudStateClient() @staticmethod def _get_subcloud_cert_name(subcloud_name): @@ -1240,11 +1223,10 @@ class SubcloudManager(manager.Manager): location=location) if management_state == consts.MANAGEMENT_UNMANAGED: - # set all endpoint statuses to unknown, except the dc-cert # endpoint which continues to be audited for unmanaged # subclouds - self.update_subcloud_endpoint_status( + self.state_rpc_client.update_subcloud_endpoint_status_sync( context, subcloud_name=subcloud.name, endpoint_type=None, @@ -1254,7 +1236,7 @@ class SubcloudManager(manager.Manager): # Subcloud is managed # Tell cert-mon to audit endpoint certificate LOG.info('Request for managed audit for %s' % subcloud.name) - dc_notification = rpc_client.DCManagerNotifications() + dc_notification = dcmanager_rpc_client.DCManagerNotifications() dc_notification.subcloud_managed(context, subcloud.name) # Since sysinv user is sync'ed during bootstrap, trigger the # related audits. 
Patch and load audits are delayed until the @@ -1266,417 +1248,6 @@ class SubcloudManager(manager.Manager): return db_api.subcloud_db_model_to_dict(subcloud) - def _do_update_subcloud_endpoint_status(self, context, subcloud_id, - endpoint_type, sync_status, - alarmable, ignore_endpoints=None): - """Update online/managed subcloud endpoint status - - :param context: request context object - :param subcloud_id: id of subcloud to update - :param endpoint_type: endpoint type to update - :param sync_status: sync status to set - :param alarmable: controls raising an alarm if applicable - :param ignore_endpoints: list of endpoints to ignore (only used if - endpoint_type is None) - """ - - if ignore_endpoints is None: - ignore_endpoints = [] - - subcloud_status_list = [] - subcloud = None - original_identity_status = None - # retrieve the info from the db for this subcloud. - # subcloud_id should not be None - try: - for subcloud, subcloud_status in db_api. \ - subcloud_get_with_status(context, subcloud_id): - if subcloud_status: - subcloud_status_list.append( - db_api.subcloud_endpoint_status_db_model_to_dict( - subcloud_status)) - if subcloud_status.endpoint_type == \ - dcorch_consts.ENDPOINT_TYPE_IDENTITY: - original_identity_status = subcloud_status.sync_status - except Exception as e: - LOG.exception(e) - raise e - - if subcloud: - if endpoint_type: - # updating a single endpoint on a single subcloud - for subcloud_status in subcloud_status_list: - if subcloud_status['endpoint_type'] == endpoint_type: - if subcloud_status['sync_status'] == sync_status: - # No change in the sync_status - LOG.debug("Sync status (%s) for subcloud %s did " - "not change - ignore update" % - (sync_status, subcloud.name)) - return - # We found the endpoint - break - else: - # We did not find the endpoint - raise exceptions.BadRequest( - resource='subcloud', - msg='Endpoint %s not found for subcloud' % - endpoint_type) - - LOG.info("Updating subcloud:%s endpoint:%s sync:%s" % - (subcloud.name, endpoint_type, sync_status)) - db_api.subcloud_status_update(context, - subcloud_id, - endpoint_type, - sync_status) - - # Trigger subcloud patch and load audits for the subcloud after - # its identity endpoint turns to other status from unknown - if endpoint_type == dcorch_consts.ENDPOINT_TYPE_IDENTITY \ - and sync_status != consts.SYNC_STATUS_UNKNOWN \ - and original_identity_status == consts.SYNC_STATUS_UNKNOWN: - LOG.debug('Request for patch and load audit for %s after updating ' - 'identity out of unknown' % subcloud.name) - self.audit_rpc_client.trigger_subcloud_patch_load_audits( - context, subcloud_id) - - entity_instance_id = "subcloud=%s.resource=%s" % \ - (subcloud.name, endpoint_type) - fault = self.fm_api.get_fault( - fm_const.FM_ALARM_ID_DC_SUBCLOUD_RESOURCE_OUT_OF_SYNC, - entity_instance_id) - - if (sync_status != consts.SYNC_STATUS_OUT_OF_SYNC) \ - and fault: - try: - self.fm_api.clear_fault( - fm_const.FM_ALARM_ID_DC_SUBCLOUD_RESOURCE_OUT_OF_SYNC, # noqa - entity_instance_id) - except Exception as e: - LOG.exception(e) - - elif not fault and alarmable and \ - (sync_status == consts.SYNC_STATUS_OUT_OF_SYNC): - entity_type_id = fm_const.FM_ENTITY_TYPE_SUBCLOUD - try: - fault = fm_api.Fault( - alarm_id=fm_const.FM_ALARM_ID_DC_SUBCLOUD_RESOURCE_OUT_OF_SYNC, # noqa - alarm_state=fm_const.FM_ALARM_STATE_SET, - entity_type_id=entity_type_id, - entity_instance_id=entity_instance_id, - severity=fm_const.FM_ALARM_SEVERITY_MAJOR, - reason_text=("%s %s sync_status is " - "out-of-sync" % - (subcloud.name, endpoint_type)), - 
alarm_type=fm_const.FM_ALARM_TYPE_0, - probable_cause=fm_const.ALARM_PROBABLE_CAUSE_2, - proposed_repair_action="If problem persists " - "contact next level " - "of support", - service_affecting=False) - - self.fm_api.set_fault(fault) - - except Exception as e: - LOG.exception(e) - - else: - # update all endpoints on this subcloud - LOG.info("Updating all endpoints on subcloud: %s sync: %s " - "ignore_endpoints: %s" % - (subcloud.name, sync_status, ignore_endpoints)) - - # TODO(yuxing): The following code can be further optimized when - # batch alarm clearance APIs are available, so we don't need to - # loop over all the endpoints of a given subcloud, e.g. - # if not ignore_endpoints: - # db_api.subcloud_status_update_endpoints_all(...) - # else: - # db_api.subcloud_status_update_endpoints(...) - endpoint_to_update_list = [] - for entry in subcloud_status_list: - endpoint = entry[consts.ENDPOINT_TYPE] - if endpoint in ignore_endpoints: - # Do not update this endpoint - continue - endpoint_to_update_list.append(endpoint) - - entity_instance_id = "subcloud=%s.resource=%s" % \ - (subcloud.name, endpoint) - - fault = self.fm_api.get_fault( - fm_const.FM_ALARM_ID_DC_SUBCLOUD_RESOURCE_OUT_OF_SYNC, - entity_instance_id) - - # TODO(yuxing): batch clear all the out-of-sync alarms of a - # given subcloud if fm_api support it. Be careful with the - # dc-cert endpoint when adding the above; the endpoint - # alarm must remain for offline subclouds. - if (sync_status != consts.SYNC_STATUS_OUT_OF_SYNC) \ - and fault: - try: - self.fm_api.clear_fault( - fm_const.FM_ALARM_ID_DC_SUBCLOUD_RESOURCE_OUT_OF_SYNC, # noqa - entity_instance_id) - except Exception as e: - LOG.exception(e) - - elif not fault and alarmable and \ - (sync_status == consts.SYNC_STATUS_OUT_OF_SYNC): - entity_type_id = fm_const.FM_ENTITY_TYPE_SUBCLOUD - try: - fault = fm_api.Fault( - alarm_id=fm_const.FM_ALARM_ID_DC_SUBCLOUD_RESOURCE_OUT_OF_SYNC, # noqa - alarm_state=fm_const.FM_ALARM_STATE_SET, - entity_type_id=entity_type_id, - entity_instance_id=entity_instance_id, - severity=fm_const.FM_ALARM_SEVERITY_MAJOR, - reason_text=("%s %s sync_status is " - "out-of-sync" % - (subcloud.name, endpoint)), - alarm_type=fm_const.FM_ALARM_TYPE_0, - probable_cause=fm_const.ALARM_PROBABLE_CAUSE_2, - proposed_repair_action="If problem persists " - "contact next level " - "of support", - service_affecting=False) - - self.fm_api.set_fault(fault) - except Exception as e: - LOG.exception(e) - - if endpoint_to_update_list: - try: - db_api.subcloud_status_update_endpoints(context, subcloud_id, - endpoint_to_update_list, - sync_status) - except Exception as e: - LOG.exception(e) - - else: - LOG.error("Subcloud not found:%s" % subcloud_id) - - @sync_update_subcloud_endpoint_status - def _update_subcloud_endpoint_status( - self, context, - subcloud_name, - endpoint_type=None, - sync_status=consts.SYNC_STATUS_OUT_OF_SYNC, - alarmable=True, - ignore_endpoints=None): - """Update subcloud endpoint status - - :param context: request context object - :param subcloud_name: name of subcloud to update - :param endpoint_type: endpoint type to update - :param sync_status: sync status to set - :param alarmable: controls raising an alarm if applicable - :param ignore_endpoints: list of endpoints to ignore (only used if - endpoint_type is None) - """ - - if ignore_endpoints is None: - ignore_endpoints = [] - - if not subcloud_name: - raise exceptions.BadRequest( - resource='subcloud', - msg='Subcloud name not provided') - - try: - subcloud = 
db_api.subcloud_get_by_name(context, subcloud_name) - except Exception as e: - LOG.exception(e) - raise e - - # Rules for updating sync status: - # - # Always update if not in-sync. - # - # Otherwise, only update the sync status if managed and online - # (unless dc-cert). - # - # Most endpoints are audited only when the subcloud is managed and - # online. An exception is the dc-cert endpoint, which is audited - # whenever the subcloud is online (managed or unmanaged). - # - # This means if a subcloud is going offline or unmanaged, then - # the sync status update must be done first. - # - if (sync_status != consts.SYNC_STATUS_IN_SYNC or - ((subcloud.availability_status == consts.AVAILABILITY_ONLINE) and - (subcloud.management_state == consts.MANAGEMENT_MANAGED - or endpoint_type == dcorch_consts.ENDPOINT_TYPE_DC_CERT))): - # update a single subcloud - try: - self._do_update_subcloud_endpoint_status(context, - subcloud.id, - endpoint_type, - sync_status, - alarmable, - ignore_endpoints) - except Exception as e: - LOG.exception(e) - raise e - else: - LOG.info("Ignoring subcloud sync_status update for subcloud:%s " - "availability:%s management:%s endpoint:%s sync:%s" % - (subcloud_name, subcloud.availability_status, - subcloud.management_state, endpoint_type, sync_status)) - - def update_subcloud_endpoint_status( - self, context, - subcloud_name=None, - endpoint_type=None, - sync_status=consts.SYNC_STATUS_OUT_OF_SYNC, - alarmable=True, - ignore_endpoints=None): - """Update subcloud endpoint status - - :param context: request context object - :param subcloud_name: name of subcloud to update - :param endpoint_type: endpoint type to update - :param sync_status: sync status to set - :param alarmable: controls raising an alarm if applicable - :param ignore_endpoints: list of endpoints to ignore (only used if - endpoint_type is None) - """ - - if ignore_endpoints is None: - ignore_endpoints = [] - - if subcloud_name: - self._update_subcloud_endpoint_status( - context, subcloud_name, endpoint_type, sync_status, alarmable, - ignore_endpoints) - else: - # update all subclouds - for subcloud in db_api.subcloud_get_all(context): - self._update_subcloud_endpoint_status( - context, subcloud.name, endpoint_type, sync_status, - alarmable, ignore_endpoints) - - def _update_subcloud_state(self, context, subcloud_name, - management_state, availability_status): - try: - self.dcorch_rpc_client.update_subcloud_states( - context, subcloud_name, management_state, availability_status) - - LOG.info('Notifying dcorch, subcloud:%s management: %s, ' - 'availability:%s' % - (subcloud_name, - management_state, - availability_status)) - except Exception: - LOG.exception('Problem informing dcorch of subcloud state change,' - 'subcloud: %s' % subcloud_name) - - def _raise_or_clear_subcloud_status_alarm(self, subcloud_name, - availability_status): - entity_instance_id = "subcloud=%s" % subcloud_name - fault = self.fm_api.get_fault( - fm_const.FM_ALARM_ID_DC_SUBCLOUD_OFFLINE, - entity_instance_id) - - if fault and (availability_status == consts.AVAILABILITY_ONLINE): - try: - self.fm_api.clear_fault( - fm_const.FM_ALARM_ID_DC_SUBCLOUD_OFFLINE, - entity_instance_id) - except Exception: - LOG.exception("Failed to clear offline alarm for subcloud: %s", - subcloud_name) - - elif not fault and \ - (availability_status == consts.AVAILABILITY_OFFLINE): - try: - fault = fm_api.Fault( - alarm_id=fm_const.FM_ALARM_ID_DC_SUBCLOUD_OFFLINE, - alarm_state=fm_const.FM_ALARM_STATE_SET, - entity_type_id=fm_const.FM_ENTITY_TYPE_SUBCLOUD, - 
entity_instance_id=entity_instance_id, - - severity=fm_const.FM_ALARM_SEVERITY_CRITICAL, - reason_text=('%s is offline' % subcloud_name), - alarm_type=fm_const.FM_ALARM_TYPE_0, - probable_cause=fm_const.ALARM_PROBABLE_CAUSE_29, - proposed_repair_action="Wait for subcloud to " - "become online; if " - "problem persists contact " - "next level of support.", - service_affecting=True) - - self.fm_api.set_fault(fault) - except Exception: - LOG.exception("Failed to raise offline alarm for subcloud: %s", - subcloud_name) - - def update_subcloud_availability(self, context, subcloud_name, - availability_status, - update_state_only=False, - audit_fail_count=None): - try: - subcloud = db_api.subcloud_get_by_name(context, subcloud_name) - except Exception: - LOG.exception("Failed to get subcloud by name: %s" % subcloud_name) - raise - - if update_state_only: - # Nothing has changed, but we want to send a state update for this - # subcloud as an audit. Get the most up-to-date data. - self._update_subcloud_state(context, subcloud_name, - subcloud.management_state, - availability_status) - elif availability_status is None: - # only update the audit fail count - try: - db_api.subcloud_update(self.context, subcloud.id, - audit_fail_count=audit_fail_count) - except exceptions.SubcloudNotFound: - # slim possibility subcloud could have been deleted since - # we found it in db, ignore this benign error. - LOG.info('Ignoring SubcloudNotFound when attempting ' - 'audit_fail_count update: %s' % subcloud_name) - return - else: - self._raise_or_clear_subcloud_status_alarm(subcloud_name, - availability_status) - - if availability_status == consts.AVAILABILITY_OFFLINE: - # Subcloud is going offline, set all endpoint statuses to - # unknown. - self._update_subcloud_endpoint_status( - context, subcloud_name, endpoint_type=None, - sync_status=consts.SYNC_STATUS_UNKNOWN) - - try: - updated_subcloud = db_api.subcloud_update( - context, - subcloud.id, - availability_status=availability_status, - audit_fail_count=audit_fail_count) - except exceptions.SubcloudNotFound: - # slim possibility subcloud could have been deleted since - # we found it in db, ignore this benign error. - LOG.info('Ignoring SubcloudNotFound when attempting state' - ' update: %s' % subcloud_name) - return - - if availability_status == consts.AVAILABILITY_ONLINE: - # Subcloud is going online - # Tell cert-mon to audit endpoint certificate. - LOG.info('Request for online audit for %s' % subcloud_name) - dc_notification = rpc_client.DCManagerNotifications() - dc_notification.subcloud_online(context, subcloud_name) - # Trigger all the audits for the subcloud so it can update the - # sync status ASAP. - self.audit_rpc_client.trigger_subcloud_audits(context, - subcloud.id) - - # Send dcorch a state update - self._update_subcloud_state(context, subcloud_name, - updated_subcloud.management_state, - availability_status) - def update_subcloud_sync_endpoint_type(self, context, subcloud_name, endpoint_type_list, @@ -1719,10 +1290,7 @@ class SubcloudManager(manager.Manager): ' type change, subcloud: %s' % subcloud_name) def handle_subcloud_operations_in_progress(self): - """Identify subclouds in transitory stages and update subcloud deploy - - state to failure. 
- """ + """Identify subclouds in transitory stages and update subcloud deploy state to failure.""" LOG.info('Identifying subclouds in transitory stages.') diff --git a/distributedcloud/dcmanager/orchestrator/states/firmware/finishing_fw_update.py b/distributedcloud/dcmanager/orchestrator/states/firmware/finishing_fw_update.py index f5e9e56ef..b5a6acd44 100644 --- a/distributedcloud/dcmanager/orchestrator/states/firmware/finishing_fw_update.py +++ b/distributedcloud/dcmanager/orchestrator/states/firmware/finishing_fw_update.py @@ -32,9 +32,9 @@ class FinishingFwUpdateState(BaseState): "Setting endpoint status of %s to %s" % (dcorch_consts.ENDPOINT_TYPE_FIRMWARE, consts.SYNC_STATUS_IN_SYNC)) - rpc_client = dcmanager_rpc_client.ManagerClient() + dcmanager_state_rpc_client = dcmanager_rpc_client.SubcloudStateClient() # The subcloud name is the same as the region in the strategy_step - rpc_client.update_subcloud_endpoint_status( + dcmanager_state_rpc_client.update_subcloud_endpoint_status( self.context, subcloud_name=self.get_region_name(strategy_step), endpoint_type=dcorch_consts.ENDPOINT_TYPE_FIRMWARE, diff --git a/distributedcloud/dcmanager/rpc/client.py b/distributedcloud/dcmanager/rpc/client.py index ad8e57469..a7de0aaa3 100644 --- a/distributedcloud/dcmanager/rpc/client.py +++ b/distributedcloud/dcmanager/rpc/client.py @@ -56,6 +56,56 @@ class RPCClient(object): return client.cast(ctxt, method, **kwargs) +class SubcloudStateClient(RPCClient): + """Client to update subcloud availability.""" + + BASE_RPC_API_VERSION = '1.0' + + def __init__(self): + super(SubcloudStateClient, self).__init__( + consts.TOPIC_DC_MANAGER_STATE, + self.BASE_RPC_API_VERSION) + + def update_subcloud_availability(self, ctxt, + subcloud_name, + availability_status, + update_state_only=False, + audit_fail_count=None): + # Note: synchronous + return self.call( + ctxt, + self.make_msg('update_subcloud_availability', + subcloud_name=subcloud_name, + availability_status=availability_status, + update_state_only=update_state_only, + audit_fail_count=audit_fail_count)) + + def update_subcloud_endpoint_status(self, ctxt, subcloud_name=None, + endpoint_type=None, + sync_status=consts. + SYNC_STATUS_OUT_OF_SYNC, + ignore_endpoints=None): + # Note: This is an asynchronous operation. + # See below for synchronous method call + return self.cast(ctxt, self.make_msg('update_subcloud_endpoint_status', + subcloud_name=subcloud_name, + endpoint_type=endpoint_type, + sync_status=sync_status, + ignore_endpoints=ignore_endpoints)) + + def update_subcloud_endpoint_status_sync(self, ctxt, subcloud_name=None, + endpoint_type=None, + sync_status=consts. + SYNC_STATUS_OUT_OF_SYNC, + ignore_endpoints=None): + # Note: synchronous + return self.call(ctxt, self.make_msg('update_subcloud_endpoint_status', + subcloud_name=subcloud_name, + endpoint_type=endpoint_type, + sync_status=sync_status, + ignore_endpoints=ignore_endpoints)) + + class ManagerClient(RPCClient): """Client side of the DC Manager rpc API. @@ -105,28 +155,6 @@ class ManagerClient(RPCClient): subcloud_id=subcloud_id, payload=payload)) - def update_subcloud_endpoint_status(self, ctxt, subcloud_name=None, - endpoint_type=None, - sync_status=consts. 
- SYNC_STATUS_OUT_OF_SYNC): - return self.cast(ctxt, self.make_msg('update_subcloud_endpoint_status', - subcloud_name=subcloud_name, - endpoint_type=endpoint_type, - sync_status=sync_status)) - - def update_subcloud_availability(self, ctxt, - subcloud_name, - availability_status, - update_state_only=False, - audit_fail_count=None): - return self.call( - ctxt, - self.make_msg('update_subcloud_availability', - subcloud_name=subcloud_name, - availability_status=availability_status, - update_state_only=update_state_only, - audit_fail_count=audit_fail_count)) - def update_subcloud_sync_endpoint_type(self, ctxt, subcloud_name, endpoint_type_list, diff --git a/distributedcloud/dcmanager/state/README.rst b/distributedcloud/dcmanager/state/README.rst new file mode 100755 index 000000000..9fc12b4ef --- /dev/null +++ b/distributedcloud/dcmanager/state/README.rst @@ -0,0 +1,12 @@ +=============================== +Service +=============================== + +DC Manager State Service has responsibility for: + Subcloud state updates coming from dcmanager-manager service + +service.py: + run DC Manager State Service in multi-worker mode, and establish RPC server + +subcloud_state_manager.py: + Provide subcloud state updates diff --git a/distributedcloud/dcmanager/state/__init__.py b/distributedcloud/dcmanager/state/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/distributedcloud/dcmanager/state/service.py b/distributedcloud/dcmanager/state/service.py new file mode 100644 index 000000000..e4b57b047 --- /dev/null +++ b/distributedcloud/dcmanager/state/service.py @@ -0,0 +1,166 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +# +# Copyright (c) 2017-2022 Wind River Systems, Inc. +# +# The right to copy, distribute, modify, or otherwise make use +# of this software may be licensed only pursuant to the terms +# of an applicable Wind River license agreement. +# + +import functools +import six + +from oslo_config import cfg +from oslo_log import log as logging +import oslo_messaging +from oslo_service import service + +from dcorch.common import consts as dcorch_consts + +from dcmanager.audit import rpcapi as dcmanager_audit_rpc_client +from dcmanager.common import consts +from dcmanager.common import context +from dcmanager.common import exceptions +from dcmanager.common.i18n import _ +from dcmanager.common import messaging as rpc_messaging +from dcmanager.state.subcloud_state_manager import SubcloudStateManager + +LOG = logging.getLogger(__name__) + + +def request_context(func): + @functools.wraps(func) + def wrapped(self, ctx, *args, **kwargs): + if ctx is not None and not isinstance(ctx, context.RequestContext): + ctx = context.RequestContext.from_dict(ctx.to_dict()) + try: + return func(self, ctx, *args, **kwargs) + except exceptions.DCManagerException: + raise oslo_messaging.rpc.dispatcher.ExpectedException() + + return wrapped + + +class DCManagerStateService(service.Service): + """Lifecycle manager for a running service. + + - All the methods in here are called from the RPC client. 
+ - If a RPC call does not have a corresponding method here, an exception + will be thrown. + - Arguments to these calls are added dynamically and will be treated as + keyword arguments by the RPC client. + """ + + def __init__(self, host): + super(DCManagerStateService, self).__init__() + self.host = cfg.CONF.host + self.rpc_api_version = consts.RPC_API_VERSION + self.topic = consts.TOPIC_DC_MANAGER_STATE + # The following are initialized here, but assigned in start() which + # happens after the fork when spawning multiple worker processes + self.engine_id = None + self.target = None + self._rpc_server = None + self.subcloud_state_manager = None + self.audit_rpc_client = None + + def _init_managers(self): + self.subcloud_state_manager = SubcloudStateManager() + + def start(self): + LOG.info("Starting %s", self.__class__.__name__) + self._init_managers() + target = oslo_messaging.Target(version=self.rpc_api_version, + server=self.host, + topic=self.topic) + self.target = target + self._rpc_server = rpc_messaging.get_rpc_server(self.target, self) + self._rpc_server.start() + # Used to notify dcmanager-audit + self.audit_rpc_client = dcmanager_audit_rpc_client.ManagerAuditClient() + + super(DCManagerStateService, self).start() + + def _stop_rpc_server(self): + # Stop RPC connection to prevent new requests + LOG.debug(_("Attempting to stop engine service...")) + try: + self._rpc_server.stop() + self._rpc_server.wait() + LOG.info('Engine service stopped successfully') + except Exception as ex: + LOG.error('Failed to stop engine service: %s', + six.text_type(ex)) + + def stop(self): + LOG.info("Stopping %s", self.__class__.__name__) + self._stop_rpc_server() + # Terminate the engine process + LOG.info("All threads were gone, terminating engine") + super(DCManagerStateService, self).stop() + + @request_context + def update_subcloud_endpoint_status(self, context, subcloud_name=None, + endpoint_type=None, + sync_status=consts. + SYNC_STATUS_OUT_OF_SYNC, + alarmable=True, + ignore_endpoints=None): + # Updates subcloud endpoint sync status + LOG.info("Handling update_subcloud_endpoint_status request for " + "subcloud: (%s) endpoint: (%s) status:(%s) " + % (subcloud_name, endpoint_type, sync_status)) + + self.subcloud_state_manager. \ + update_subcloud_endpoint_status(context, + subcloud_name, + endpoint_type, + sync_status, + alarmable, + ignore_endpoints) + + # If the patching sync status is being set to unknown, trigger the + # patching audit so it can update the sync status ASAP. + if endpoint_type == dcorch_consts.ENDPOINT_TYPE_PATCHING and \ + sync_status == consts.SYNC_STATUS_UNKNOWN: + self.audit_rpc_client.trigger_patch_audit(context) + + # If the firmware sync status is being set to unknown, trigger the + # firmware audit so it can update the sync status ASAP. + if endpoint_type == dcorch_consts.ENDPOINT_TYPE_FIRMWARE and \ + sync_status == consts.SYNC_STATUS_UNKNOWN: + self.audit_rpc_client.trigger_firmware_audit(context) + + # If the kubernetes sync status is being set to unknown, trigger the + # kubernetes audit so it can update the sync status ASAP. 
+ if endpoint_type == dcorch_consts.ENDPOINT_TYPE_KUBERNETES and \ + sync_status == consts.SYNC_STATUS_UNKNOWN: + self.audit_rpc_client.trigger_kubernetes_audit(context) + + return + + @request_context + def update_subcloud_availability(self, context, + subcloud_name, + availability_status, + update_state_only=False, + audit_fail_count=None): + # Updates subcloud availability + LOG.info("Handling update_subcloud_availability request for: %s" % + subcloud_name) + self.subcloud_state_manager.update_subcloud_availability( + context, + subcloud_name, + availability_status, + update_state_only, + audit_fail_count) diff --git a/distributedcloud/dcmanager/state/subcloud_state_manager.py b/distributedcloud/dcmanager/state/subcloud_state_manager.py new file mode 100644 index 000000000..360b176f7 --- /dev/null +++ b/distributedcloud/dcmanager/state/subcloud_state_manager.py @@ -0,0 +1,522 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +# +# Copyright (c) 2017-2022 Wind River Systems, Inc. +# +# The right to copy, distribute, modify, or otherwise make use +# of this software may be licensed only pursuant to the terms +# of an applicable Wind River license agreement. +# + +from oslo_log import log as logging + +from dcorch.common import consts as dcorch_consts +from dcorch.rpc import client as dcorch_rpc_client + +from dcmanager.audit import rpcapi as dcmanager_audit_rpc_client +from dcmanager.common import consts +from dcmanager.common import context +from dcmanager.common import exceptions +from dcmanager.common import manager +from dcmanager.common import utils +from dcmanager.rpc import client as rpc_client + +from dcmanager.db import api as db_api + +from fm_api import constants as fm_const +from fm_api import fm_api + +LOG = logging.getLogger(__name__) + + +def sync_update_subcloud_endpoint_status(func): + """Synchronized lock decorator for _update_subcloud_endpoint_status. """ + + def _get_lock_and_call(*args, **kwargs): + """Get a single fair lock per subcloud based on subcloud name. 
""" + + # subcloud name is the 3rd argument to + # _update_subcloud_endpoint_status() + @utils.synchronized(args[2], external=True, fair=True) + def _call_func(*args, **kwargs): + return func(*args, **kwargs) + + return _call_func(*args, **kwargs) + + return _get_lock_and_call + + +class SubcloudStateManager(manager.Manager): + """Manages tasks related to subclouds.""" + + def __init__(self, *args, **kwargs): + LOG.debug('SubcloudStateManager initialization...') + + super(SubcloudStateManager, + self).__init__(service_name="subcloud_manager", *args, **kwargs) + self.context = context.get_admin_context() + self.dcorch_rpc_client = dcorch_rpc_client.EngineClient() + self.fm_api = fm_api.FaultAPIs() + self.audit_rpc_client = dcmanager_audit_rpc_client.ManagerAuditClient() + + def _do_update_subcloud_endpoint_status(self, context, subcloud_id, + endpoint_type, sync_status, + alarmable, ignore_endpoints=None): + """Update online/managed subcloud endpoint status + + :param context: request context object + :param subcloud_id: id of subcloud to update + :param endpoint_type: endpoint type to update + :param sync_status: sync status to set + :param alarmable: controls raising an alarm if applicable + :param ignore_endpoints: list of endpoints to ignore (only used if + endpoint_type is None) + """ + + if ignore_endpoints is None: + ignore_endpoints = [] + + subcloud_status_list = [] + subcloud = None + original_identity_status = None + # retrieve the info from the db for this subcloud. + # subcloud_id should not be None + try: + for subcloud, subcloud_status in db_api. \ + subcloud_get_with_status(context, subcloud_id): + if subcloud_status: + subcloud_status_list.append( + db_api.subcloud_endpoint_status_db_model_to_dict( + subcloud_status)) + if subcloud_status.endpoint_type == \ + dcorch_consts.ENDPOINT_TYPE_IDENTITY: + original_identity_status = subcloud_status.sync_status + except Exception as e: + LOG.exception(e) + raise e + + if subcloud: + if endpoint_type: + # updating a single endpoint on a single subcloud + for subcloud_status in subcloud_status_list: + if subcloud_status['endpoint_type'] == endpoint_type: + if subcloud_status['sync_status'] == sync_status: + # No change in the sync_status + LOG.debug("Sync status (%s) for subcloud %s did " + "not change - ignore update" % + (sync_status, subcloud.name)) + return + # We found the endpoint + break + else: + # We did not find the endpoint + raise exceptions.BadRequest( + resource='subcloud', + msg='Endpoint %s not found for subcloud' % + endpoint_type) + + LOG.info("Updating subcloud:%s endpoint:%s sync:%s" % + (subcloud.name, endpoint_type, sync_status)) + db_api.subcloud_status_update(context, + subcloud_id, + endpoint_type, + sync_status) + + # Trigger subcloud patch and load audits for the subcloud after + # its identity endpoint turns to other status from unknown + if endpoint_type == dcorch_consts.ENDPOINT_TYPE_IDENTITY \ + and sync_status != consts.SYNC_STATUS_UNKNOWN \ + and original_identity_status == consts.SYNC_STATUS_UNKNOWN: + LOG.debug('Request for patch and load audit for %s after updating ' + 'identity out of unknown' % subcloud.name) + self.audit_rpc_client.trigger_subcloud_patch_load_audits( + context, subcloud_id) + + entity_instance_id = "subcloud=%s.resource=%s" % \ + (subcloud.name, endpoint_type) + fault = self.fm_api.get_fault( + fm_const.FM_ALARM_ID_DC_SUBCLOUD_RESOURCE_OUT_OF_SYNC, + entity_instance_id) + + if (sync_status != consts.SYNC_STATUS_OUT_OF_SYNC) \ + and fault: + try: + self.fm_api.clear_fault( + 
fm_const.FM_ALARM_ID_DC_SUBCLOUD_RESOURCE_OUT_OF_SYNC, # noqa + entity_instance_id) + except Exception as e: + LOG.exception(e) + + elif not fault and alarmable and \ + (sync_status == consts.SYNC_STATUS_OUT_OF_SYNC): + entity_type_id = fm_const.FM_ENTITY_TYPE_SUBCLOUD + try: + fault = fm_api.Fault( + alarm_id=fm_const.FM_ALARM_ID_DC_SUBCLOUD_RESOURCE_OUT_OF_SYNC, # noqa + alarm_state=fm_const.FM_ALARM_STATE_SET, + entity_type_id=entity_type_id, + entity_instance_id=entity_instance_id, + severity=fm_const.FM_ALARM_SEVERITY_MAJOR, + reason_text=("%s %s sync_status is " + "out-of-sync" % + (subcloud.name, endpoint_type)), + alarm_type=fm_const.FM_ALARM_TYPE_0, + probable_cause=fm_const.ALARM_PROBABLE_CAUSE_2, + proposed_repair_action="If problem persists " + "contact next level " + "of support", + service_affecting=False) + + self.fm_api.set_fault(fault) + + except Exception as e: + LOG.exception(e) + + else: + # update all endpoints on this subcloud + LOG.info("Updating all endpoints on subcloud: %s sync: %s " + "ignore_endpoints: %s" % + (subcloud.name, sync_status, ignore_endpoints)) + + # TODO(yuxing): The following code can be further optimized when + # batch alarm clearance APIs are available, so we don't need to + # loop over all the endpoints of a given subcloud, e.g. + # if not ignore_endpoints: + # db_api.subcloud_status_update_endpoints_all(...) + # else: + # db_api.subcloud_status_update_endpoints(...) + endpoint_to_update_list = [] + for entry in subcloud_status_list: + endpoint = entry[consts.ENDPOINT_TYPE] + if endpoint in ignore_endpoints: + # Do not update this endpoint + continue + endpoint_to_update_list.append(endpoint) + + entity_instance_id = "subcloud=%s.resource=%s" % \ + (subcloud.name, endpoint) + + fault = self.fm_api.get_fault( + fm_const.FM_ALARM_ID_DC_SUBCLOUD_RESOURCE_OUT_OF_SYNC, + entity_instance_id) + + # TODO(yuxing): batch clear all the out-of-sync alarms of a + # given subcloud if fm_api support it. Be careful with the + # dc-cert endpoint when adding the above; the endpoint + # alarm must remain for offline subclouds. 
+ if (sync_status != consts.SYNC_STATUS_OUT_OF_SYNC) \ + and fault: + try: + self.fm_api.clear_fault( + fm_const.FM_ALARM_ID_DC_SUBCLOUD_RESOURCE_OUT_OF_SYNC, # noqa + entity_instance_id) + except Exception as e: + LOG.exception(e) + + elif not fault and alarmable and \ + (sync_status == consts.SYNC_STATUS_OUT_OF_SYNC): + entity_type_id = fm_const.FM_ENTITY_TYPE_SUBCLOUD + try: + fault = fm_api.Fault( + alarm_id=fm_const.FM_ALARM_ID_DC_SUBCLOUD_RESOURCE_OUT_OF_SYNC, # noqa + alarm_state=fm_const.FM_ALARM_STATE_SET, + entity_type_id=entity_type_id, + entity_instance_id=entity_instance_id, + severity=fm_const.FM_ALARM_SEVERITY_MAJOR, + reason_text=("%s %s sync_status is " + "out-of-sync" % + (subcloud.name, endpoint)), + alarm_type=fm_const.FM_ALARM_TYPE_0, + probable_cause=fm_const.ALARM_PROBABLE_CAUSE_2, + proposed_repair_action="If problem persists " + "contact next level " + "of support", + service_affecting=False) + + self.fm_api.set_fault(fault) + except Exception as e: + LOG.exception(e) + + if endpoint_to_update_list: + try: + db_api.subcloud_status_update_endpoints(context, subcloud_id, + endpoint_to_update_list, + sync_status) + except Exception as e: + LOG.exception(e) + + else: + LOG.error("Subcloud not found:%s" % subcloud_id) + + @sync_update_subcloud_endpoint_status + def _update_subcloud_endpoint_status( + self, context, + subcloud_name, + endpoint_type=None, + sync_status=consts.SYNC_STATUS_OUT_OF_SYNC, + alarmable=True, + ignore_endpoints=None): + """Update subcloud endpoint status + + :param context: request context object + :param subcloud_name: name of subcloud to update + :param endpoint_type: endpoint type to update + :param sync_status: sync status to set + :param alarmable: controls raising an alarm if applicable + :param ignore_endpoints: list of endpoints to ignore (only used if + endpoint_type is None) + """ + + if ignore_endpoints is None: + ignore_endpoints = [] + + if not subcloud_name: + raise exceptions.BadRequest( + resource='subcloud', + msg='Subcloud name not provided') + + try: + subcloud = db_api.subcloud_get_by_name(context, subcloud_name) + except Exception as e: + LOG.exception(e) + raise e + + # Rules for updating sync status: + # + # Always update if not in-sync. + # + # Otherwise, only update the sync status if managed and online + # (unless dc-cert). + # + # Most endpoints are audited only when the subcloud is managed and + # online. An exception is the dc-cert endpoint, which is audited + # whenever the subcloud is online (managed or unmanaged). + # + # This means if a subcloud is going offline or unmanaged, then + # the sync status update must be done first. 
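+        # For example: an out-of-sync update is always applied, while an
+        # in-sync update for an offline or unmanaged subcloud is ignored
+        # (dc-cert being the exception: online alone is enough for it).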
+        #
+        if (sync_status != consts.SYNC_STATUS_IN_SYNC or
+            ((subcloud.availability_status == consts.AVAILABILITY_ONLINE) and
+                (subcloud.management_state == consts.MANAGEMENT_MANAGED
+                    or endpoint_type == dcorch_consts.ENDPOINT_TYPE_DC_CERT))):
+            # update a single subcloud
+            try:
+                self._do_update_subcloud_endpoint_status(context,
+                                                         subcloud.id,
+                                                         endpoint_type,
+                                                         sync_status,
+                                                         alarmable,
+                                                         ignore_endpoints)
+            except Exception as e:
+                LOG.exception(e)
+                raise e
+        else:
+            LOG.info("Ignoring subcloud sync_status update for subcloud:%s "
+                     "availability:%s management:%s endpoint:%s sync:%s" %
+                     (subcloud_name, subcloud.availability_status,
+                      subcloud.management_state, endpoint_type, sync_status))
+
+    def update_subcloud_endpoint_status(
+            self, context,
+            subcloud_name=None,
+            endpoint_type=None,
+            sync_status=consts.SYNC_STATUS_OUT_OF_SYNC,
+            alarmable=True,
+            ignore_endpoints=None):
+        """Update subcloud endpoint status
+
+        :param context: request context object
+        :param subcloud_name: name of subcloud to update
+        :param endpoint_type: endpoint type to update
+        :param sync_status: sync status to set
+        :param alarmable: controls raising an alarm if applicable
+        :param ignore_endpoints: list of endpoints to ignore (only used if
+               endpoint_type is None)
+        """
+
+        if ignore_endpoints is None:
+            ignore_endpoints = []
+
+        if subcloud_name:
+            self._update_subcloud_endpoint_status(
+                context, subcloud_name, endpoint_type, sync_status, alarmable,
+                ignore_endpoints)
+        else:
+            # update all subclouds
+            for subcloud in db_api.subcloud_get_all(context):
+                self._update_subcloud_endpoint_status(
+                    context, subcloud.name, endpoint_type, sync_status,
+                    alarmable, ignore_endpoints)
+
+    def _update_subcloud_state(self, context, subcloud_name,
+                               management_state, availability_status):
+        try:
+            LOG.info('Notifying dcorch, subcloud:%s management: %s, '
+                     'availability:%s' %
+                     (subcloud_name,
+                      management_state,
+                      availability_status))
+
+            self.dcorch_rpc_client.update_subcloud_states(
+                context, subcloud_name, management_state, availability_status)
+
+        except Exception:
+            LOG.exception('Problem informing dcorch of subcloud state change, '
+                          'subcloud: %s' % subcloud_name)
+
+    def _raise_or_clear_subcloud_status_alarm(self, subcloud_name,
+                                              availability_status):
+        entity_instance_id = "subcloud=%s" % subcloud_name
+        fault = self.fm_api.get_fault(
+            fm_const.FM_ALARM_ID_DC_SUBCLOUD_OFFLINE,
+            entity_instance_id)
+
+        if fault and (availability_status == consts.AVAILABILITY_ONLINE):
+            try:
+                self.fm_api.clear_fault(
+                    fm_const.FM_ALARM_ID_DC_SUBCLOUD_OFFLINE,
+                    entity_instance_id)
+            except Exception:
+                LOG.exception("Failed to clear offline alarm for subcloud: %s",
+                              subcloud_name)
+
+        elif not fault and \
+                (availability_status == consts.AVAILABILITY_OFFLINE):
+            try:
+                fault = fm_api.Fault(
+                    alarm_id=fm_const.FM_ALARM_ID_DC_SUBCLOUD_OFFLINE,
+                    alarm_state=fm_const.FM_ALARM_STATE_SET,
+                    entity_type_id=fm_const.FM_ENTITY_TYPE_SUBCLOUD,
+                    entity_instance_id=entity_instance_id,
+                    severity=fm_const.FM_ALARM_SEVERITY_CRITICAL,
+                    reason_text=('%s is offline' % subcloud_name),
+                    alarm_type=fm_const.FM_ALARM_TYPE_0,
+                    probable_cause=fm_const.ALARM_PROBABLE_CAUSE_29,
+                    proposed_repair_action="Wait for subcloud to "
+                                           "become online; if "
+                                           "problem persists contact "
+                                           "next level of support.",
+                    service_affecting=True)
+
+                self.fm_api.set_fault(fault)
+            except Exception:
+                LOG.exception("Failed to raise offline alarm for subcloud: %s",
+                              subcloud_name)
+
+    def update_subcloud_availability(self, context, subcloud_name,
availability_status, + update_state_only=False, + audit_fail_count=None): + try: + subcloud = db_api.subcloud_get_by_name(context, subcloud_name) + except Exception: + LOG.exception("Failed to get subcloud by name: %s" % subcloud_name) + raise + + if update_state_only: + # Nothing has changed, but we want to send a state update for this + # subcloud as an audit. Get the most up-to-date data. + self._update_subcloud_state(context, subcloud_name, + subcloud.management_state, + availability_status) + elif availability_status is None: + # only update the audit fail count + try: + db_api.subcloud_update(self.context, subcloud.id, + audit_fail_count=audit_fail_count) + except exceptions.SubcloudNotFound: + # slim possibility subcloud could have been deleted since + # we found it in db, ignore this benign error. + LOG.info('Ignoring SubcloudNotFound when attempting ' + 'audit_fail_count update: %s' % subcloud_name) + return + else: + self._raise_or_clear_subcloud_status_alarm(subcloud_name, + availability_status) + + if availability_status == consts.AVAILABILITY_OFFLINE: + # Subcloud is going offline, set all endpoint statuses to + # unknown. + self._update_subcloud_endpoint_status( + context, subcloud_name, endpoint_type=None, + sync_status=consts.SYNC_STATUS_UNKNOWN) + + try: + updated_subcloud = db_api.subcloud_update( + context, + subcloud.id, + availability_status=availability_status, + audit_fail_count=audit_fail_count) + except exceptions.SubcloudNotFound: + # slim possibility subcloud could have been deleted since + # we found it in db, ignore this benign error. + LOG.info('Ignoring SubcloudNotFound when attempting state' + ' update: %s' % subcloud_name) + return + + if availability_status == consts.AVAILABILITY_ONLINE: + # Subcloud is going online + # Tell cert-mon to audit endpoint certificate. + LOG.info('Request for online audit for %s' % subcloud_name) + dc_notification = rpc_client.DCManagerNotifications() + dc_notification.subcloud_online(context, subcloud_name) + # Trigger all the audits for the subcloud so it can update the + # sync status ASAP. 
+ self.audit_rpc_client.trigger_subcloud_audits(context, + subcloud.id) + + # Send dcorch a state update + self._update_subcloud_state(context, subcloud_name, + updated_subcloud.management_state, + availability_status) + + def update_subcloud_sync_endpoint_type(self, context, + subcloud_name, + endpoint_type_list, + openstack_installed): + operation = 'add' if openstack_installed else 'remove' + func_switcher = { + 'add': ( + self.dcorch_rpc_client.add_subcloud_sync_endpoint_type, + db_api.subcloud_status_create + ), + 'remove': ( + self.dcorch_rpc_client.remove_subcloud_sync_endpoint_type, + db_api.subcloud_status_delete + ) + } + + try: + subcloud = db_api.subcloud_get_by_name(context, subcloud_name) + except Exception: + LOG.exception("Failed to get subcloud by name: %s" % subcloud_name) + raise + + try: + # Notify dcorch to add/remove sync endpoint type list + func_switcher[operation][0](self.context, subcloud_name, + endpoint_type_list) + LOG.info('Notifying dcorch, subcloud: %s new sync endpoint: %s' % + (subcloud_name, endpoint_type_list)) + + # Update subcloud status table by adding/removing openstack sync + # endpoint types + for endpoint_type in endpoint_type_list: + func_switcher[operation][1](self.context, subcloud.id, + endpoint_type) + # Update openstack_installed of subcloud table + db_api.subcloud_update(self.context, subcloud.id, + openstack_installed=openstack_installed) + except Exception: + LOG.exception('Problem informing dcorch of subcloud sync endpoint' + ' type change, subcloud: %s' % subcloud_name) diff --git a/distributedcloud/dcmanager/tests/unit/api/v1/controllers/test_subclouds.py b/distributedcloud/dcmanager/tests/unit/api/v1/controllers/test_subclouds.py index f090fd481..1bfd922b3 100644 --- a/distributedcloud/dcmanager/tests/unit/api/v1/controllers/test_subclouds.py +++ b/distributedcloud/dcmanager/tests/unit/api/v1/controllers/test_subclouds.py @@ -240,6 +240,10 @@ class TestSubcloudPost(testroot.DCManagerApiTest, self.mock_rpc_client = p.start() self.addCleanup(p.stop) + p = mock.patch.object(rpc_client, 'SubcloudStateClient') + self.mock_rpc_state_client = p.start() + self.addCleanup(p.stop) + def _verify_post_failure(self, response, param, value): self.assertEqual(http_client.BAD_REQUEST, response.status_code, @@ -773,6 +777,10 @@ class TestSubcloudAPIOther(testroot.DCManagerApiTest): super(TestSubcloudAPIOther, self).setUp() self.ctx = utils.dummy_context() + p = mock.patch.object(rpc_client, 'SubcloudStateClient') + self.mock_rpc_state_client = p.start() + self.addCleanup(p.stop) + @mock.patch.object(rpc_client, 'ManagerClient') def test_delete_subcloud(self, mock_rpc_client): subcloud = fake_subcloud.create_fake_subcloud(self.ctx) @@ -1162,19 +1170,20 @@ class TestSubcloudAPIOther(testroot.DCManagerApiTest): headers=FAKE_HEADERS, params=data) @mock.patch.object(rpc_client, 'ManagerClient') + @mock.patch.object(rpc_client, 'SubcloudStateClient') @mock.patch.object(subclouds.SubcloudsController, '_get_updatestatus_payload') def test_subcloud_updatestatus(self, mock_get_updatestatus_payload, - mock_rpc_client): + mock_rpc_state_client, _): subcloud = fake_subcloud.create_fake_subcloud(self.ctx) data = {'endpoint': 'dc-cert', 'status': 'in-sync'} mock_get_updatestatus_payload.return_value = data - mock_rpc_client().update_subcloud_endpoint_status.return_value = True + mock_rpc_state_client().update_subcloud_endpoint_status.return_value = True response = self.app.patch_json( FAKE_URL + '/' + str(subcloud.id) + '/update_status', data, headers=FAKE_HEADERS) 
- mock_rpc_client().update_subcloud_endpoint_status.assert_called_once_with( + mock_rpc_state_client().update_subcloud_endpoint_status.assert_called_once_with( mock.ANY, subcloud.name, 'dc-cert', diff --git a/distributedcloud/dcmanager/tests/unit/audit/test_firmware_audit_manager.py b/distributedcloud/dcmanager/tests/unit/audit/test_firmware_audit_manager.py index c6d815a88..6604f12f9 100644 --- a/distributedcloud/dcmanager/tests/unit/audit/test_firmware_audit_manager.py +++ b/distributedcloud/dcmanager/tests/unit/audit/test_firmware_audit_manager.py @@ -32,15 +32,13 @@ from dcorch.common import consts as dcorch_consts CONF = cfg.CONF -class FakeDCManagerAPI(object): - +class FakeDCManagerStateAPI(object): def __init__(self): self.update_subcloud_availability = mock.MagicMock() self.update_subcloud_endpoint_status = mock.MagicMock() class FakeAuditWorkerAPI(object): - def __init__(self): self.audit_subclouds = mock.MagicMock() @@ -405,11 +403,12 @@ class TestFirmwareAudit(base.DCManagerTestCase): super(TestFirmwareAudit, self).setUp() self.ctxt = utils.dummy_context() - # Mock the DCManager API - self.fake_dcmanager_api = FakeDCManagerAPI() - p = mock.patch('dcmanager.rpc.client.ManagerClient') - self.mock_dcmanager_api = p.start() - self.mock_dcmanager_api.return_value = self.fake_dcmanager_api + # Mock the DCManager subcloud state API + self.fake_dcmanager_state_api = FakeDCManagerStateAPI() + p = mock.patch('dcmanager.rpc.client.SubcloudStateClient') + self.mock_dcmanager_state_api = p.start() + self.mock_dcmanager_state_api.return_value = \ + self.fake_dcmanager_state_api self.addCleanup(p.stop) # Mock the Audit Worker API @@ -436,10 +435,10 @@ class TestFirmwareAudit(base.DCManagerTestCase): def test_init(self): fm = firmware_audit.FirmwareAudit(self.ctxt, - self.fake_dcmanager_api) + self.fake_dcmanager_state_api) self.assertIsNotNone(fm) self.assertEqual(self.ctxt, fm.context) - self.assertEqual(self.fake_dcmanager_api, fm.dcmanager_rpc_client) + self.assertEqual(self.fake_dcmanager_state_api, fm.state_rpc_client) @mock.patch.object(patch_audit, 'SysinvClient') @mock.patch.object(patch_audit, 'PatchingClient') @@ -458,7 +457,7 @@ class TestFirmwareAudit(base.DCManagerTestCase): mock_fw_sysinv_client.side_effect = FakeSysinvClientNoAuditData fm = firmware_audit.FirmwareAudit(self.ctxt, - self.fake_dcmanager_api) + self.fake_dcmanager_state_api) am = subcloud_audit_manager.SubcloudAuditManager() am.firmware_audit = fm firmware_audit_data = self.get_fw_audit_data(am) @@ -470,7 +469,7 @@ class TestFirmwareAudit(base.DCManagerTestCase): subcloud_name=name, endpoint_type=dcorch_consts.ENDPOINT_TYPE_FIRMWARE, sync_status=consts.SYNC_STATUS_IN_SYNC)] - self.fake_dcmanager_api.update_subcloud_endpoint_status. \ + self.fake_dcmanager_state_api.update_subcloud_endpoint_status. \ assert_has_calls(expected_calls) @mock.patch.object(patch_audit, 'SysinvClient') @@ -490,7 +489,7 @@ class TestFirmwareAudit(base.DCManagerTestCase): mock_fw_sysinv_client.side_effect = FakeSysinvClientNoEnabledDevices fm = firmware_audit.FirmwareAudit(self.ctxt, - self.fake_dcmanager_api) + self.fake_dcmanager_state_api) am = subcloud_audit_manager.SubcloudAuditManager() am.firmware_audit = fm firmware_audit_data = self.get_fw_audit_data(am) @@ -502,7 +501,7 @@ class TestFirmwareAudit(base.DCManagerTestCase): subcloud_name=name, endpoint_type=dcorch_consts.ENDPOINT_TYPE_FIRMWARE, sync_status=consts.SYNC_STATUS_IN_SYNC)] - self.fake_dcmanager_api.update_subcloud_endpoint_status. 
\ + self.fake_dcmanager_state_api.update_subcloud_endpoint_status. \ assert_has_calls(expected_calls) @mock.patch.object(patch_audit, 'SysinvClient') @@ -521,7 +520,7 @@ class TestFirmwareAudit(base.DCManagerTestCase): mock_fw_sysinv_client.side_effect = FakeSysinvClientImageWithoutLabels fm = firmware_audit.FirmwareAudit(self.ctxt, - self.fake_dcmanager_api) + self.fake_dcmanager_state_api) am = subcloud_audit_manager.SubcloudAuditManager() am.firmware_audit = fm firmware_audit_data = self.get_fw_audit_data(am) @@ -533,7 +532,7 @@ class TestFirmwareAudit(base.DCManagerTestCase): subcloud_name=name, endpoint_type=dcorch_consts.ENDPOINT_TYPE_FIRMWARE, sync_status=consts.SYNC_STATUS_IN_SYNC)] - self.fake_dcmanager_api.update_subcloud_endpoint_status. \ + self.fake_dcmanager_state_api.update_subcloud_endpoint_status. \ assert_has_calls(expected_calls) @mock.patch.object(patch_audit, 'SysinvClient') @@ -552,7 +551,7 @@ class TestFirmwareAudit(base.DCManagerTestCase): mock_fw_sysinv_client.side_effect = FakeSysinvClientImageNotApplied fm = firmware_audit.FirmwareAudit(self.ctxt, - self.fake_dcmanager_api) + self.fake_dcmanager_state_api) am = subcloud_audit_manager.SubcloudAuditManager() am.firmware_audit = fm firmware_audit_data = self.get_fw_audit_data(am) @@ -564,7 +563,7 @@ class TestFirmwareAudit(base.DCManagerTestCase): subcloud_name=name, endpoint_type=dcorch_consts.ENDPOINT_TYPE_FIRMWARE, sync_status=consts.SYNC_STATUS_OUT_OF_SYNC)] - self.fake_dcmanager_api.update_subcloud_endpoint_status. \ + self.fake_dcmanager_state_api.update_subcloud_endpoint_status. \ assert_has_calls(expected_calls) @mock.patch.object(patch_audit, 'SysinvClient') @@ -583,7 +582,7 @@ class TestFirmwareAudit(base.DCManagerTestCase): mock_fw_sysinv_client.side_effect = FakeSysinvClientImageNotWritten fm = firmware_audit.FirmwareAudit(self.ctxt, - self.fake_dcmanager_api) + self.fake_dcmanager_state_api) am = subcloud_audit_manager.SubcloudAuditManager() am.firmware_audit = fm firmware_audit_data = self.get_fw_audit_data(am) @@ -595,7 +594,7 @@ class TestFirmwareAudit(base.DCManagerTestCase): subcloud_name=name, endpoint_type=dcorch_consts.ENDPOINT_TYPE_FIRMWARE, sync_status=consts.SYNC_STATUS_OUT_OF_SYNC)] - self.fake_dcmanager_api.update_subcloud_endpoint_status. \ + self.fake_dcmanager_state_api.update_subcloud_endpoint_status. \ assert_has_calls(expected_calls) @mock.patch.object(patch_audit, 'SysinvClient') @@ -614,7 +613,7 @@ class TestFirmwareAudit(base.DCManagerTestCase): mock_fw_sysinv_client.side_effect = FakeSysinvClientImageWithLabels fm = firmware_audit.FirmwareAudit(self.ctxt, - self.fake_dcmanager_api) + self.fake_dcmanager_state_api) am = subcloud_audit_manager.SubcloudAuditManager() am.firmware_audit = fm firmware_audit_data = self.get_fw_audit_data(am) @@ -626,7 +625,7 @@ class TestFirmwareAudit(base.DCManagerTestCase): subcloud_name=name, endpoint_type=dcorch_consts.ENDPOINT_TYPE_FIRMWARE, sync_status=consts.SYNC_STATUS_IN_SYNC)] - self.fake_dcmanager_api.update_subcloud_endpoint_status. \ + self.fake_dcmanager_state_api.update_subcloud_endpoint_status. 
\ assert_has_calls(expected_calls) @mock.patch.object(patch_audit, 'SysinvClient') @@ -645,7 +644,7 @@ class TestFirmwareAudit(base.DCManagerTestCase): mock_fw_sysinv_client.side_effect = FakeSysinvClientNoMatchingDeviceLabel fm = firmware_audit.FirmwareAudit(self.ctxt, - self.fake_dcmanager_api) + self.fake_dcmanager_state_api) am = subcloud_audit_manager.SubcloudAuditManager() am.firmware_audit = fm firmware_audit_data = self.get_fw_audit_data(am) @@ -657,7 +656,7 @@ class TestFirmwareAudit(base.DCManagerTestCase): subcloud_name=name, endpoint_type=dcorch_consts.ENDPOINT_TYPE_FIRMWARE, sync_status=consts.SYNC_STATUS_IN_SYNC)] - self.fake_dcmanager_api.update_subcloud_endpoint_status. \ + self.fake_dcmanager_state_api.update_subcloud_endpoint_status. \ assert_has_calls(expected_calls) @mock.patch.object(patch_audit, 'SysinvClient') @@ -676,7 +675,7 @@ class TestFirmwareAudit(base.DCManagerTestCase): mock_fw_sysinv_client.side_effect = FakeSysinvClientNoMatchingDeviceId fm = firmware_audit.FirmwareAudit(self.ctxt, - self.fake_dcmanager_api) + self.fake_dcmanager_state_api) am = subcloud_audit_manager.SubcloudAuditManager() am.firmware_audit = fm firmware_audit_data = self.get_fw_audit_data(am) @@ -688,5 +687,5 @@ class TestFirmwareAudit(base.DCManagerTestCase): subcloud_name=name, endpoint_type=dcorch_consts.ENDPOINT_TYPE_FIRMWARE, sync_status=consts.SYNC_STATUS_IN_SYNC)] - self.fake_dcmanager_api.update_subcloud_endpoint_status. \ + self.fake_dcmanager_state_api.update_subcloud_endpoint_status. \ assert_has_calls(expected_calls) diff --git a/distributedcloud/dcmanager/tests/unit/audit/test_kube_audit_manager.py b/distributedcloud/dcmanager/tests/unit/audit/test_kube_audit_manager.py index 4fc80ab92..da4221eee 100644 --- a/distributedcloud/dcmanager/tests/unit/audit/test_kube_audit_manager.py +++ b/distributedcloud/dcmanager/tests/unit/audit/test_kube_audit_manager.py @@ -30,8 +30,7 @@ PREVIOUS_KUBE_VERSION = 'v1.2.3' UPGRADED_KUBE_VERSION = 'v1.2.3-a' -class FakeDCManagerAPI(object): - +class FakeDCManagerStateAPI(object): def __init__(self): self.update_subcloud_availability = mock.MagicMock() self.update_subcloud_endpoint_status = mock.MagicMock() @@ -80,11 +79,12 @@ class TestKubernetesAudit(base.DCManagerTestCase): super(TestKubernetesAudit, self).setUp() self.ctxt = utils.dummy_context() - # Mock the DCManager API - self.fake_dcmanager_api = FakeDCManagerAPI() - p = mock.patch('dcmanager.rpc.client.ManagerClient') - self.mock_dcmanager_api = p.start() - self.mock_dcmanager_api.return_value = self.fake_dcmanager_api + # Mock the DCManager subcloud state API + self.fake_dcmanager_state_api = FakeDCManagerStateAPI() + p = mock.patch('dcmanager.rpc.client.SubcloudStateClient') + self.mock_dcmanager_state_api = p.start() + self.mock_dcmanager_state_api.return_value = \ + self.fake_dcmanager_state_api self.addCleanup(p.stop) # Mock the Audit Worker API @@ -151,17 +151,18 @@ class TestKubernetesAudit(base.DCManagerTestCase): def test_init(self): audit = kubernetes_audit.KubernetesAudit(self.ctxt, - self.fake_dcmanager_api) + self.fake_dcmanager_state_api) self.assertIsNotNone(audit) self.assertEqual(self.ctxt, audit.context) - self.assertEqual(self.fake_dcmanager_api, audit.dcmanager_rpc_client) + self.assertEqual(self.fake_dcmanager_state_api, + audit.state_rpc_client) @mock.patch.object(subcloud_audit_manager, 'context') def test_no_kubernetes_audit_data_to_sync(self, mock_context): mock_context.get_admin_context.return_value = self.ctxt audit = kubernetes_audit.KubernetesAudit(self.ctxt, 
- self.fake_dcmanager_api) + self.fake_dcmanager_state_api) am = subcloud_audit_manager.SubcloudAuditManager() am.kubernetes_audit = audit kubernetes_audit_data = self.get_kube_audit_data(am) @@ -173,14 +174,14 @@ class TestKubernetesAudit(base.DCManagerTestCase): subcloud_name=name, endpoint_type=dcorch_consts.ENDPOINT_TYPE_KUBERNETES, sync_status=consts.SYNC_STATUS_IN_SYNC)] - self.fake_dcmanager_api.update_subcloud_endpoint_status. \ + self.fake_dcmanager_state_api.update_subcloud_endpoint_status. \ assert_has_calls(expected_calls) @mock.patch.object(subcloud_audit_manager, 'context') def test_kubernetes_audit_data_out_of_sync_older(self, mock_context): mock_context.get_admin_context.return_value = self.ctxt audit = kubernetes_audit.KubernetesAudit(self.ctxt, - self.fake_dcmanager_api) + self.fake_dcmanager_state_api) am = subcloud_audit_manager.SubcloudAuditManager() am.kubernetes_audit = audit @@ -201,14 +202,14 @@ class TestKubernetesAudit(base.DCManagerTestCase): subcloud_name=name, endpoint_type=dcorch_consts.ENDPOINT_TYPE_KUBERNETES, sync_status=consts.SYNC_STATUS_OUT_OF_SYNC)] - self.fake_dcmanager_api.update_subcloud_endpoint_status. \ + self.fake_dcmanager_state_api.update_subcloud_endpoint_status. \ assert_has_calls(expected_calls) @mock.patch.object(subcloud_audit_manager, 'context') def test_kubernetes_audit_data_out_of_sync_newer(self, mock_context): mock_context.get_admin_context.return_value = self.ctxt audit = kubernetes_audit.KubernetesAudit(self.ctxt, - self.fake_dcmanager_api) + self.fake_dcmanager_state_api) am = subcloud_audit_manager.SubcloudAuditManager() am.kubernetes_audit = audit @@ -229,7 +230,7 @@ class TestKubernetesAudit(base.DCManagerTestCase): subcloud_name=name, endpoint_type=dcorch_consts.ENDPOINT_TYPE_KUBERNETES, sync_status=consts.SYNC_STATUS_OUT_OF_SYNC)] - self.fake_dcmanager_api.update_subcloud_endpoint_status. \ + self.fake_dcmanager_state_api.update_subcloud_endpoint_status. \ assert_has_calls(expected_calls) @mock.patch.object(subcloud_audit_manager, 'context') @@ -237,7 +238,7 @@ class TestKubernetesAudit(base.DCManagerTestCase): mock_context): mock_context.get_admin_context.return_value = self.ctxt audit = kubernetes_audit.KubernetesAudit(self.ctxt, - self.fake_dcmanager_api) + self.fake_dcmanager_state_api) am = subcloud_audit_manager.SubcloudAuditManager() am.kubernetes_audit = audit @@ -258,7 +259,7 @@ class TestKubernetesAudit(base.DCManagerTestCase): subcloud_name=name, endpoint_type=dcorch_consts.ENDPOINT_TYPE_KUBERNETES, sync_status=consts.SYNC_STATUS_IN_SYNC)] - self.fake_dcmanager_api.update_subcloud_endpoint_status. \ + self.fake_dcmanager_state_api.update_subcloud_endpoint_status. \ assert_has_calls(expected_calls) @mock.patch.object(subcloud_audit_manager, 'context') @@ -268,7 +269,7 @@ class TestKubernetesAudit(base.DCManagerTestCase): # even if the kube versions match mock_context.get_admin_context.return_value = self.ctxt audit = kubernetes_audit.KubernetesAudit(self.ctxt, - self.fake_dcmanager_api) + self.fake_dcmanager_state_api) am = subcloud_audit_manager.SubcloudAuditManager() am.kubernetes_audit = audit @@ -293,5 +294,5 @@ class TestKubernetesAudit(base.DCManagerTestCase): subcloud_name=name, endpoint_type=dcorch_consts.ENDPOINT_TYPE_KUBERNETES, sync_status=consts.SYNC_STATUS_OUT_OF_SYNC)] - self.fake_dcmanager_api.update_subcloud_endpoint_status. \ + self.fake_dcmanager_state_api.update_subcloud_endpoint_status. 
\ assert_has_calls(expected_calls) diff --git a/distributedcloud/dcmanager/tests/unit/audit/test_patch_audit_manager.py b/distributedcloud/dcmanager/tests/unit/audit/test_patch_audit_manager.py index 0e1bcf479..540c791e1 100644 --- a/distributedcloud/dcmanager/tests/unit/audit/test_patch_audit_manager.py +++ b/distributedcloud/dcmanager/tests/unit/audit/test_patch_audit_manager.py @@ -31,8 +31,7 @@ from dcorch.common import consts as dcorch_consts CONF = cfg.CONF -class FakeDCManagerAPI(object): - +class FakeDCManagerStateAPI(object): def __init__(self): self.update_subcloud_availability = mock.MagicMock() self.update_subcloud_endpoint_status = mock.MagicMock() @@ -253,11 +252,12 @@ class TestPatchAudit(base.DCManagerTestCase): super(TestPatchAudit, self).setUp() self.ctxt = utils.dummy_context() - # Mock the DCManager API - self.fake_dcmanager_api = FakeDCManagerAPI() - p = mock.patch('dcmanager.rpc.client.ManagerClient') - self.mock_dcmanager_api = p.start() - self.mock_dcmanager_api.return_value = self.fake_dcmanager_api + # Mock the DCManager subcloud state API + self.fake_dcmanager_state_api = FakeDCManagerStateAPI() + p = mock.patch('dcmanager.rpc.client.SubcloudStateClient') + self.mock_dcmanager_state_api = p.start() + self.mock_dcmanager_state_api.return_value = \ + self.fake_dcmanager_state_api self.addCleanup(p.stop) # Mock the Audit Worker API @@ -277,10 +277,10 @@ class TestPatchAudit(base.DCManagerTestCase): def test_init(self): pm = patch_audit.PatchAudit(self.ctxt, - self.fake_dcmanager_api) + self.fake_dcmanager_state_api) self.assertIsNotNone(pm) self.assertEqual(self.ctxt, pm.context) - self.assertEqual(self.fake_dcmanager_api, pm.dcmanager_rpc_client) + self.assertEqual(self.fake_dcmanager_state_api, pm.state_rpc_client) @mock.patch.object(patch_audit, 'SysinvClient') @mock.patch.object(patch_audit, 'PatchingClient') @@ -295,7 +295,7 @@ class TestPatchAudit(base.DCManagerTestCase): mock_sysinv_client.side_effect = FakeSysinvClientOneLoad pm = patch_audit.PatchAudit(self.ctxt, - self.fake_dcmanager_api) + self.fake_dcmanager_state_api) am = subcloud_audit_manager.SubcloudAuditManager() am.patch_audit = pm @@ -313,7 +313,7 @@ class TestPatchAudit(base.DCManagerTestCase): subcloud_name=name, endpoint_type=dcorch_consts.ENDPOINT_TYPE_LOAD, sync_status=consts.SYNC_STATUS_IN_SYNC)] - self.fake_dcmanager_api.update_subcloud_endpoint_status. \ + self.fake_dcmanager_state_api.update_subcloud_endpoint_status. 
\ assert_has_calls(expected_calls) @mock.patch.object(patch_audit, 'SysinvClient') @@ -326,7 +326,7 @@ class TestPatchAudit(base.DCManagerTestCase): mock_sysinv_client): mock_context.get_admin_context.return_value = self.ctxt pm = patch_audit.PatchAudit(self.ctxt, - self.fake_dcmanager_api) + self.fake_dcmanager_state_api) am = subcloud_audit_manager.SubcloudAuditManager() am.patch_audit = pm @@ -374,7 +374,7 @@ class TestPatchAudit(base.DCManagerTestCase): sync_status=consts.SYNC_STATUS_IN_SYNC), ] - self.fake_dcmanager_api.update_subcloud_endpoint_status.\ + self.fake_dcmanager_state_api.update_subcloud_endpoint_status.\ assert_has_calls(expected_calls) @mock.patch.object(patch_audit, 'SysinvClient') @@ -387,7 +387,7 @@ class TestPatchAudit(base.DCManagerTestCase): mock_sysinv_client): mock_context.get_admin_context.return_value = self.ctxt pm = patch_audit.PatchAudit(self.ctxt, - self.fake_dcmanager_api) + self.fake_dcmanager_state_api) am = subcloud_audit_manager.SubcloudAuditManager() am.patch_audit = pm @@ -408,7 +408,7 @@ class TestPatchAudit(base.DCManagerTestCase): subcloud_name=name, endpoint_type=dcorch_consts.ENDPOINT_TYPE_LOAD, sync_status=consts.SYNC_STATUS_IN_SYNC)] - self.fake_dcmanager_api.update_subcloud_endpoint_status.\ + self.fake_dcmanager_state_api.update_subcloud_endpoint_status.\ assert_has_calls(expected_calls) @mock.patch.object(patch_audit, 'SysinvClient') @@ -422,7 +422,7 @@ class TestPatchAudit(base.DCManagerTestCase): mock_sysinv_client): mock_context.get_admin_context.return_value = self.ctxt pm = patch_audit.PatchAudit(self.ctxt, - self.fake_dcmanager_api) + self.fake_dcmanager_state_api) am = subcloud_audit_manager.SubcloudAuditManager() am.patch_audit = pm mock_patching_client.side_effect = FakePatchingClientInSync @@ -452,7 +452,7 @@ class TestPatchAudit(base.DCManagerTestCase): endpoint_type=dcorch_consts.ENDPOINT_TYPE_LOAD, sync_status=consts.SYNC_STATUS_OUT_OF_SYNC), ] - self.fake_dcmanager_api.update_subcloud_endpoint_status.\ + self.fake_dcmanager_state_api.update_subcloud_endpoint_status.\ assert_has_calls(expected_calls) @mock.patch.object(patch_audit, 'SysinvClient') @@ -466,7 +466,7 @@ class TestPatchAudit(base.DCManagerTestCase): mock_sysinv_client): mock_context.get_admin_context.return_value = self.ctxt pm = patch_audit.PatchAudit(self.ctxt, - self.fake_dcmanager_api) + self.fake_dcmanager_state_api) am = subcloud_audit_manager.SubcloudAuditManager() am.patch_audit = pm mock_patching_client.side_effect = FakePatchingClientInSync @@ -496,5 +496,5 @@ class TestPatchAudit(base.DCManagerTestCase): endpoint_type=dcorch_consts.ENDPOINT_TYPE_LOAD, sync_status=consts.SYNC_STATUS_OUT_OF_SYNC), ] - self.fake_dcmanager_api.update_subcloud_endpoint_status.\ + self.fake_dcmanager_state_api.update_subcloud_endpoint_status.\ assert_has_calls(expected_calls) diff --git a/distributedcloud/dcmanager/tests/unit/audit/test_subcloud_audit_worker_manager.py b/distributedcloud/dcmanager/tests/unit/audit/test_subcloud_audit_worker_manager.py index 4aa7ff310..97a24ec9d 100644 --- a/distributedcloud/dcmanager/tests/unit/audit/test_subcloud_audit_worker_manager.py +++ b/distributedcloud/dcmanager/tests/unit/audit/test_subcloud_audit_worker_manager.py @@ -31,8 +31,12 @@ from dcmanager.tests import base class FakeDCManagerAPI(object): def __init__(self): - self.update_subcloud_availability = mock.MagicMock() self.update_subcloud_sync_endpoint_type = mock.MagicMock() + + +class FakeDCManagerStateAPI(object): + def __init__(self): + self.update_subcloud_availability = 
mock.MagicMock() self.update_subcloud_endpoint_status = mock.MagicMock() @@ -239,6 +243,14 @@ class TestAuditWorkerManager(base.DCManagerTestCase): self.mock_dcmanager_api.return_value = self.fake_dcmanager_api self.addCleanup(p.stop) + # Mock the DCManager subcloud state API + self.fake_dcmanager_state_api = FakeDCManagerStateAPI() + p = mock.patch('dcmanager.rpc.client.SubcloudStateClient') + self.mock_dcmanager_state_api = p.start() + self.mock_dcmanager_state_api.return_value = \ + self.fake_dcmanager_state_api + self.addCleanup(p.stop) + # Mock the Audit Worker API self.fake_audit_worker_api = FakeAuditWorkerAPI() p = mock.patch('dcmanager.audit.rpcapi.ManagerAuditWorkerClient') @@ -413,7 +425,7 @@ class TestAuditWorkerManager(base.DCManagerTestCase): do_kube_rootca_update_audit) # Verify the subcloud was set to online - self.fake_dcmanager_api.update_subcloud_availability.assert_called_with( + self.fake_dcmanager_state_api.update_subcloud_availability.assert_called_with( mock.ANY, subcloud.name, consts.AVAILABILITY_ONLINE, False, 0) @@ -485,7 +497,7 @@ class TestAuditWorkerManager(base.DCManagerTestCase): do_kube_rootca_update_audit) # Verify the subcloud was set to online - self.fake_dcmanager_api.update_subcloud_availability.assert_called_with( + self.fake_dcmanager_state_api.update_subcloud_availability.assert_called_with( mock.ANY, subcloud.name, consts.AVAILABILITY_ONLINE, False, 0) @@ -538,7 +550,7 @@ class TestAuditWorkerManager(base.DCManagerTestCase): do_kube_rootca_update_audit=False) # Verify the subcloud state was not updated - self.fake_dcmanager_api.update_subcloud_availability.\ + self.fake_dcmanager_state_api.update_subcloud_availability.\ assert_not_called() # Verify the _update_subcloud_audit_fail_count is not called @@ -581,7 +593,7 @@ class TestAuditWorkerManager(base.DCManagerTestCase): do_kube_rootca_update_audit=False) # Verify the subcloud state was updated even though no change - self.fake_dcmanager_api.update_subcloud_availability.assert_called_with( + self.fake_dcmanager_state_api.update_subcloud_availability.assert_called_with( mock.ANY, subcloud.name, consts.AVAILABILITY_ONLINE, True, None) @@ -661,7 +673,7 @@ class TestAuditWorkerManager(base.DCManagerTestCase): self.assertEqual(subcloud.audit_fail_count, audit_fail_count) # Verify the update_subcloud_availability was not called - self.fake_dcmanager_api.update_subcloud_availability.assert_not_called() + self.fake_dcmanager_state_api.update_subcloud_availability.assert_not_called() # Update the DB like dcmanager would do. 
subcloud = db_api.subcloud_update( @@ -689,7 +701,7 @@ class TestAuditWorkerManager(base.DCManagerTestCase): self.assertEqual(subcloud.audit_fail_count, audit_fail_count) # Verify the update_subcloud_availability was not called - self.fake_dcmanager_api.update_subcloud_availability.assert_not_called() + self.fake_dcmanager_state_api.update_subcloud_availability.assert_not_called() # Verify alarm update is called only once self.fake_alarm_aggr.update_alarm_summary.assert_called_once_with( @@ -754,7 +766,7 @@ class TestAuditWorkerManager(base.DCManagerTestCase): do_kube_rootca_update_audit=do_kube_rootca_update_audit) # Verify the subcloud state was not updated - self.fake_dcmanager_api.update_subcloud_availability.\ + self.fake_dcmanager_state_api.update_subcloud_availability.\ assert_not_called() # Verify the _update_subcloud_audit_fail_count is not called @@ -831,7 +843,7 @@ class TestAuditWorkerManager(base.DCManagerTestCase): subcloud.audit_fail_count, audit_fail_count + 1) # Verify the subcloud state was not updated - self.fake_dcmanager_api.update_subcloud_availability.\ + self.fake_dcmanager_state_api.update_subcloud_availability.\ assert_not_called() # Verify the openstack endpoints were not updated @@ -898,7 +910,7 @@ class TestAuditWorkerManager(base.DCManagerTestCase): False) # do_kube_rootca_audit # Verify the subcloud state was not updated - self.fake_dcmanager_api.update_subcloud_availability.\ + self.fake_dcmanager_state_api.update_subcloud_availability.\ assert_not_called() # Verify the _update_subcloud_audit_fail_count is not called @@ -960,7 +972,7 @@ class TestAuditWorkerManager(base.DCManagerTestCase): False) # do_kube_rootca_update_audit # Verify the subcloud state was not updated - self.fake_dcmanager_api.update_subcloud_availability.\ + self.fake_dcmanager_state_api.update_subcloud_availability.\ assert_not_called() # Verify the _update_subcloud_audit_fail_count is not called @@ -1021,7 +1033,7 @@ class TestAuditWorkerManager(base.DCManagerTestCase): False) # do_kube_rootca_update_audit # Verify the subcloud state was not updated - self.fake_dcmanager_api.update_subcloud_availability.\ + self.fake_dcmanager_state_api.update_subcloud_availability.\ assert_not_called() # Verify the _update_subcloud_audit_fail_count is not called diff --git a/distributedcloud/dcmanager/tests/unit/manager/test_service.py b/distributedcloud/dcmanager/tests/unit/manager/test_service.py index 6dfd8c158..19964a902 100644 --- a/distributedcloud/dcmanager/tests/unit/manager/test_service.py +++ b/distributedcloud/dcmanager/tests/unit/manager/test_service.py @@ -61,10 +61,6 @@ class TestDCManagerService(base.DCManagerTestCase): self.assertEqual(self.service_obj.host, 'localhost') self.assertEqual(self.service_obj.topic, 'dcmanager') - def test_init_tgm(self): - self.service_obj.init_tgm() - self.assertIsNotNone(self.service_obj.TG) - @mock.patch.object(service, 'SubcloudManager') def test_init_managers(self, mock_subcloud_manager): self.service_obj.init_managers() @@ -80,7 +76,6 @@ class TestDCManagerService(base.DCManagerTestCase): @mock.patch.object(service, 'SubcloudManager') def test_add_subcloud(self, mock_subcloud_manager): - self.service_obj.init_tgm() self.service_obj.init_managers() self.service_obj.add_subcloud( self.context, payload={'name': 'testname'}) @@ -89,7 +84,6 @@ class TestDCManagerService(base.DCManagerTestCase): @mock.patch.object(service, 'SubcloudManager') def test_delete_subcloud(self, mock_subcloud_manager): - self.service_obj.init_tgm() self.service_obj.init_managers() 
self.service_obj.delete_subcloud( self.context, subcloud_id=1) @@ -98,7 +92,6 @@ class TestDCManagerService(base.DCManagerTestCase): @mock.patch.object(service, 'SubcloudManager') def test_update_subcloud(self, mock_subcloud_manager): - self.service_obj.init_tgm() self.service_obj.init_managers() self.service_obj.update_subcloud( self.context, subcloud_id=1, management_state='testmgmtstatus') diff --git a/distributedcloud/dcmanager/tests/unit/manager/test_subcloud_manager.py b/distributedcloud/dcmanager/tests/unit/manager/test_subcloud_manager.py index 0350dc65a..c2c342326 100644 --- a/distributedcloud/dcmanager/tests/unit/manager/test_subcloud_manager.py +++ b/distributedcloud/dcmanager/tests/unit/manager/test_subcloud_manager.py @@ -29,6 +29,7 @@ from dcmanager.common import exceptions from dcmanager.common import utils as cutils from dcmanager.db.sqlalchemy import api as db_api from dcmanager.manager import subcloud_manager +from dcmanager.state import subcloud_state_manager from dcmanager.tests import base from dcmanager.tests.unit.common import fake_subcloud from dcmanager.tests import utils @@ -37,12 +38,17 @@ from tsconfig.tsconfig import SW_VERSION class FakeDCManagerAuditAPI(object): - def __init__(self): self.trigger_subcloud_audits = mock.MagicMock() self.trigger_subcloud_patch_load_audits = mock.MagicMock() +class FakeDCManagerStateAPI(object): + def __init__(self): + self.update_subcloud_availability = mock.MagicMock() + self.update_subcloud_endpoint_status = mock.MagicMock() + + class FakeDCOrchAPI(object): def __init__(self): self.update_subcloud_states = mock.MagicMock() @@ -233,6 +239,14 @@ class TestSubcloudManager(base.DCManagerTestCase): self.fake_dcmanager_audit_api self.addCleanup(p.stop) + # Mock the DCManager subcloud state API + self.fake_dcmanager_state_api = FakeDCManagerStateAPI() + p = mock.patch('dcmanager.rpc.client.SubcloudStateClient') + self.mock_dcmanager_state_api = p.start() + self.mock_dcmanager_state_api.return_value = \ + self.fake_dcmanager_state_api + self.addCleanup(p.stop) + # Mock the DCOrch API self.fake_dcorch_api = FakeDCOrchAPI() p = mock.patch('dcorch.rpc.client.EngineClient') @@ -693,7 +707,7 @@ class TestSubcloudManager(base.DCManagerTestCase): self.assertEqual(status.sync_status, consts.SYNC_STATUS_UNKNOWN) # Update/verify each status with the default sync state: out-of-sync - sm = subcloud_manager.SubcloudManager() + ssm = subcloud_state_manager.SubcloudStateManager() for endpoint in [dcorch_consts.ENDPOINT_TYPE_PLATFORM, dcorch_consts.ENDPOINT_TYPE_IDENTITY, dcorch_consts.ENDPOINT_TYPE_PATCHING, @@ -701,7 +715,7 @@ class TestSubcloudManager(base.DCManagerTestCase): dcorch_consts.ENDPOINT_TYPE_NFV, dcorch_consts.ENDPOINT_TYPE_DC_CERT]: # Update - sm.update_subcloud_endpoint_status( + ssm.update_subcloud_endpoint_status( self.ctx, subcloud_name=subcloud.name, endpoint_type=endpoint) @@ -720,7 +734,7 @@ class TestSubcloudManager(base.DCManagerTestCase): dcorch_consts.ENDPOINT_TYPE_FM, dcorch_consts.ENDPOINT_TYPE_NFV, dcorch_consts.ENDPOINT_TYPE_DC_CERT]: - sm.update_subcloud_endpoint_status( + ssm.update_subcloud_endpoint_status( self.ctx, subcloud_name=subcloud.name, endpoint_type=endpoint, sync_status=consts.SYNC_STATUS_IN_SYNC) @@ -734,7 +748,7 @@ class TestSubcloudManager(base.DCManagerTestCase): # Attempt to update each status to be unknown for an offline/unmanaged # subcloud. This is allowed. 
- sm.update_subcloud_endpoint_status( + ssm.update_subcloud_endpoint_status( self.ctx, subcloud_name=subcloud.name, endpoint_type=None, sync_status=consts.SYNC_STATUS_UNKNOWN) @@ -753,7 +767,7 @@ class TestSubcloudManager(base.DCManagerTestCase): # Attempt to update each status to be out-of-sync for an # offline/unmanaged subcloud. Exclude one endpoint. This is allowed. - sm.update_subcloud_endpoint_status( + ssm.update_subcloud_endpoint_status( self.ctx, subcloud_name=subcloud.name, endpoint_type=None, sync_status=consts.SYNC_STATUS_OUT_OF_SYNC, @@ -795,7 +809,7 @@ class TestSubcloudManager(base.DCManagerTestCase): dcorch_consts.ENDPOINT_TYPE_PATCHING, dcorch_consts.ENDPOINT_TYPE_FM, dcorch_consts.ENDPOINT_TYPE_NFV]: - sm.update_subcloud_endpoint_status( + ssm.update_subcloud_endpoint_status( self.ctx, subcloud_name=subcloud.name, endpoint_type=endpoint, sync_status=consts.SYNC_STATUS_IN_SYNC) @@ -810,7 +824,7 @@ class TestSubcloudManager(base.DCManagerTestCase): # Attempt to update dc-cert status to be in-sync for an # online/unmanaged subcloud. This is allowed. Verify the change. endpoint = dcorch_consts.ENDPOINT_TYPE_DC_CERT - sm.update_subcloud_endpoint_status( + ssm.update_subcloud_endpoint_status( self.ctx, subcloud_name=subcloud.name, endpoint_type=endpoint, sync_status=consts.SYNC_STATUS_IN_SYNC) @@ -840,7 +854,7 @@ class TestSubcloudManager(base.DCManagerTestCase): dcorch_consts.ENDPOINT_TYPE_FM, dcorch_consts.ENDPOINT_TYPE_NFV, dcorch_consts.ENDPOINT_TYPE_DC_CERT]: - sm.update_subcloud_endpoint_status( + ssm.update_subcloud_endpoint_status( self.ctx, subcloud_name=subcloud.name, endpoint_type=endpoint, sync_status=consts.SYNC_STATUS_IN_SYNC) @@ -860,7 +874,7 @@ class TestSubcloudManager(base.DCManagerTestCase): dcorch_consts.ENDPOINT_TYPE_FM, dcorch_consts.ENDPOINT_TYPE_NFV, dcorch_consts.ENDPOINT_TYPE_DC_CERT]: - sm.update_subcloud_endpoint_status( + ssm.update_subcloud_endpoint_status( self.ctx, subcloud_name=subcloud.name, endpoint_type=endpoint, sync_status=consts.SYNC_STATUS_OUT_OF_SYNC) @@ -888,7 +902,7 @@ class TestSubcloudManager(base.DCManagerTestCase): mock_dcmanager_api = p.start() mock_dcmanager_api.return_value = fake_dcmanager_cermon_api - sm = subcloud_manager.SubcloudManager() + ssm = subcloud_state_manager.SubcloudStateManager() db_api.subcloud_update(self.ctx, subcloud.id, management_state=consts.MANAGEMENT_MANAGED) @@ -904,8 +918,8 @@ class TestSubcloudManager(base.DCManagerTestCase): self.assertIsNotNone(status) self.assertEqual(status.sync_status, consts.SYNC_STATUS_UNKNOWN) - sm.update_subcloud_availability(self.ctx, subcloud.name, - consts.AVAILABILITY_ONLINE) + ssm.update_subcloud_availability(self.ctx, subcloud.name, + consts.AVAILABILITY_ONLINE) updated_subcloud = db_api.subcloud_get_by_name(self.ctx, 'subcloud1') # Verify the subcloud was set to online @@ -935,7 +949,7 @@ class TestSubcloudManager(base.DCManagerTestCase): mock_dcmanager_api = p.start() mock_dcmanager_api.return_value = fake_dcmanager_cermon_api - sm = subcloud_manager.SubcloudManager() + ssm = subcloud_state_manager.SubcloudStateManager() # Note that we have intentionally left the subcloud as "unmanaged" @@ -951,8 +965,8 @@ class TestSubcloudManager(base.DCManagerTestCase): self.assertIsNotNone(status) self.assertEqual(status.sync_status, consts.SYNC_STATUS_UNKNOWN) - sm.update_subcloud_availability(self.ctx, subcloud.name, - consts.AVAILABILITY_ONLINE) + ssm.update_subcloud_availability(self.ctx, subcloud.name, + consts.AVAILABILITY_ONLINE) updated_subcloud = 
db_api.subcloud_get_by_name(self.ctx, 'subcloud1') # Verify the subcloud was set to online @@ -977,7 +991,7 @@ class TestSubcloudManager(base.DCManagerTestCase): management_state=consts.MANAGEMENT_MANAGED, availability_status=consts.AVAILABILITY_ONLINE) - sm = subcloud_manager.SubcloudManager() + ssm = subcloud_state_manager.SubcloudStateManager() # create sync statuses for endpoints and set them to in-sync for endpoint in [dcorch_consts.ENDPOINT_TYPE_PLATFORM, @@ -987,7 +1001,7 @@ class TestSubcloudManager(base.DCManagerTestCase): dcorch_consts.ENDPOINT_TYPE_NFV]: db_api.subcloud_status_create( self.ctx, subcloud.id, endpoint) - sm.update_subcloud_endpoint_status( + ssm.update_subcloud_endpoint_status( self.ctx, subcloud_name=subcloud.name, endpoint_type=endpoint, sync_status=consts.SYNC_STATUS_IN_SYNC) @@ -999,9 +1013,9 @@ class TestSubcloudManager(base.DCManagerTestCase): # Audit fails once audit_fail_count = 1 - sm.update_subcloud_availability(self.ctx, subcloud.name, - availability_status=None, - audit_fail_count=audit_fail_count) + ssm.update_subcloud_availability(self.ctx, subcloud.name, + availability_status=None, + audit_fail_count=audit_fail_count) # Verify the subclcoud availability was not updated updated_subcloud = db_api.subcloud_get_by_name(self.ctx, 'subcloud1') self.assertEqual(updated_subcloud.availability_status, @@ -1014,9 +1028,9 @@ class TestSubcloudManager(base.DCManagerTestCase): # Audit fails again audit_fail_count = audit_fail_count + 1 - sm.update_subcloud_availability(self.ctx, subcloud.name, - consts.AVAILABILITY_OFFLINE, - audit_fail_count=audit_fail_count) + ssm.update_subcloud_availability(self.ctx, subcloud.name, + consts.AVAILABILITY_OFFLINE, + audit_fail_count=audit_fail_count) # Verify the subclcoud availability was updated updated_subcloud = db_api.subcloud_get_by_name(self.ctx, 'subcloud1') @@ -1039,7 +1053,7 @@ class TestSubcloudManager(base.DCManagerTestCase): subcloud = self.create_subcloud_static(self.ctx, name='subcloud1') self.assertIsNotNone(subcloud) - sm = subcloud_manager.SubcloudManager() + ssm = subcloud_state_manager.SubcloudStateManager() # Set the subcloud to online/managed db_api.subcloud_update(self.ctx, subcloud.id, @@ -1060,7 +1074,7 @@ class TestSubcloudManager(base.DCManagerTestCase): consts.SYNC_STATUS_UNKNOWN]: # Update identity to the original status - sm.update_subcloud_endpoint_status( + ssm.update_subcloud_endpoint_status( self.ctx, subcloud_name=subcloud.name, endpoint_type=endpoint, sync_status=original_sync_status) @@ -1070,7 +1084,7 @@ class TestSubcloudManager(base.DCManagerTestCase): self.fake_dcmanager_audit_api.trigger_subcloud_patch_load_audits.call_count # Update identity to new status and get the count of the trigger again - sm.update_subcloud_endpoint_status( + ssm.update_subcloud_endpoint_status( self.ctx, subcloud_name=subcloud.name, endpoint_type=endpoint, sync_status=new_sync_status) diff --git a/distributedcloud/dcorch/api/proxy/apps/controller.py b/distributedcloud/dcorch/api/proxy/apps/controller.py index ff2ab8e43..066526931 100644 --- a/distributedcloud/dcorch/api/proxy/apps/controller.py +++ b/distributedcloud/dcorch/api/proxy/apps/controller.py @@ -389,7 +389,7 @@ class SysinvAPIController(APIController): def __init__(self, app, conf): super(SysinvAPIController, self).__init__(app, conf) - self.dcmanager_rpc_client = dcmanager_rpc_client.ManagerClient() + self.dcmanager_state_rpc_client = dcmanager_rpc_client.SubcloudStateClient() self.response_hander_map = { self.ENDPOINT_TYPE: self._process_response } 
@@ -423,7 +423,7 @@ class SysinvAPIController(APIController): # Send a RPC to dcmanager LOG.info("Send RPC to dcmanager to set: %s sync status to: %s" % (endpoint_type, sync_status)) - self.dcmanager_rpc_client.update_subcloud_endpoint_status( + self.dcmanager_state_rpc_client.update_subcloud_endpoint_status( self.ctxt, endpoint_type=endpoint_type, sync_status=sync_status) diff --git a/distributedcloud/dcorch/api/proxy/apps/patch.py b/distributedcloud/dcorch/api/proxy/apps/patch.py index f5c5c7d25..c603a7d45 100644 --- a/distributedcloud/dcorch/api/proxy/apps/patch.py +++ b/distributedcloud/dcorch/api/proxy/apps/patch.py @@ -64,7 +64,7 @@ class PatchAPIController(Middleware): super(PatchAPIController, self).__init__(app) self.ctxt = context.get_admin_context() self._default_dispatcher = APIDispatcher(app) - self.rpc_client = dcmanager_rpc_client.ManagerClient() + self.dcmanager_state_rpc_client = dcmanager_rpc_client.SubcloudStateClient() self.response_hander_map = { proxy_consts.PATCH_ACTION_UPLOAD: self.patch_upload_req, proxy_consts.PATCH_ACTION_UPLOAD_DIR: self.patch_upload_dir_req, @@ -186,7 +186,7 @@ class PatchAPIController(Middleware): # Send a RPC to dcmanager LOG.info("Send RPC to dcmanager to set patching sync status to " "unknown") - self.rpc_client.update_subcloud_endpoint_status( + self.dcmanager_state_rpc_client.update_subcloud_endpoint_status( self.ctxt, endpoint_type=self.ENDPOINT_TYPE, sync_status=dcmanager_consts.SYNC_STATUS_UNKNOWN) diff --git a/distributedcloud/dcorch/cmd/engine.py b/distributedcloud/dcorch/cmd/engine.py index 704cef79a..5f49d4307 100644 --- a/distributedcloud/dcorch/cmd/engine.py +++ b/distributedcloud/dcorch/cmd/engine.py @@ -43,6 +43,9 @@ def main(): messaging.setup() dmanager_messaging.setup() + LOG.info("Launching dcorch-engine, host=%s, workers=%s ...", + cfg.CONF.host, cfg.CONF.workers) + srv = engine.EngineService(cfg.CONF.host, consts.TOPIC_ORCH_ENGINE) launcher = service.launch(cfg.CONF, diff --git a/distributedcloud/dcorch/engine/generic_sync_manager.py b/distributedcloud/dcorch/engine/generic_sync_manager.py index e49dba98d..d8db61739 100644 --- a/distributedcloud/dcorch/engine/generic_sync_manager.py +++ b/distributedcloud/dcorch/engine/generic_sync_manager.py @@ -75,6 +75,7 @@ class GenericSyncManager(object): self.create_sync_objects(sc.region_name, sc.capabilities) LOG.info('Engine id:(%s) create_sync_objects for' 'subcloud:%s.' 
% (self.engine_id, sc.region_name)) + eventlet.sleep(0) # cooperative yield def create_sync_objects(self, subcloud_name, capabilities): """Create sync object objects for the subcloud diff --git a/distributedcloud/dcorch/engine/service.py b/distributedcloud/dcorch/engine/service.py index 8fb6ca275..2ccd652de 100644 --- a/distributedcloud/dcorch/engine/service.py +++ b/distributedcloud/dcorch/engine/service.py @@ -108,6 +108,7 @@ class EngineService(service.Service): self.TG.start(self.ism.initial_sync_thread, self.engine_id) def start(self): + LOG.info("Starting %s", self.__class__.__name__) self.engine_id = uuidutils.generate_uuid() target = oslo_messaging.Target(version=self.rpc_api_version, server=self.host, @@ -303,9 +304,10 @@ class EngineService(service.Service): # Stop RPC connection to prevent new requests LOG.debug(_("Attempting to stop engine service...")) try: - self._rpc_server.stop() - self._rpc_server.wait() - LOG.info('Engine service stopped successfully') + if self._rpc_server: + self._rpc_server.stop() + self._rpc_server.wait() + LOG.info('Engine service stopped successfully') except Exception as ex: LOG.error('Failed to stop engine service: %s', six.text_type(ex)) @@ -313,7 +315,9 @@ class EngineService(service.Service): def stop(self): self._stop_rpc_server() - self.TG.stop() + if self.TG: + self.TG.stop() + # Terminate the engine process LOG.info("All threads were gone, terminating engine") super(EngineService, self).stop() diff --git a/distributedcloud/dcorch/engine/sync_thread.py b/distributedcloud/dcorch/engine/sync_thread.py index d8c6fa0c8..fc3696506 100644 --- a/distributedcloud/dcorch/engine/sync_thread.py +++ b/distributedcloud/dcorch/engine/sync_thread.py @@ -76,7 +76,7 @@ class SyncThread(object): self.log_extra = { "instance": self.subcloud_name + ": "} - self.dcmanager_rpc_client = dcmanager_rpc_client.ManagerClient() + self.dcmanager_state_rpc_client = dcmanager_rpc_client.SubcloudStateClient() self.sc_admin_session = None self.admin_session = None @@ -282,7 +282,7 @@ class SyncThread(object): LOG.info("{}: set_sync_status {}".format(self.subcloud_name, sync_status), extra=self.log_extra) - self.dcmanager_rpc_client.update_subcloud_endpoint_status( + self.dcmanager_state_rpc_client.update_subcloud_endpoint_status( self.ctxt, self.subcloud_name, self.endpoint_type, sync_status) diff --git a/distributedcloud/ocf/dcmanager-state b/distributedcloud/ocf/dcmanager-state new file mode 100644 index 000000000..19bb127e2 --- /dev/null +++ b/distributedcloud/ocf/dcmanager-state @@ -0,0 +1,327 @@ +#!/bin/sh +# OpenStack DC Manager State Service (dcmanager-state) +# +# Description: +# Manages an OpenStack DC Manager State Service (dcmanager-state) +# process as an HA resource +# +# Copyright (c) 2022 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 +# +# +# See usage() function below for more details ... +# +# OCF instance parameters: +# OCF_RESKEY_binary +# OCF_RESKEY_config +# OCF_RESKEY_user +# OCF_RESKEY_pid +# OCF_RESKEY_additional_parameters +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
+
+#######################################################################
+
+# Fill in some defaults if no values are specified
+
+OCF_RESKEY_binary_default="/usr/bin/dcmanager-state"
+OCF_RESKEY_config_default="/etc/dcmanager/dcmanager.conf"
+OCF_RESKEY_user_default="root"
+OCF_RESKEY_pid_default="$HA_RSCTMP/$OCF_RESOURCE_INSTANCE.pid"
+
+: ${OCF_RESKEY_binary=${OCF_RESKEY_binary_default}}
+: ${OCF_RESKEY_config=${OCF_RESKEY_config_default}}
+: ${OCF_RESKEY_user=${OCF_RESKEY_user_default}}
+: ${OCF_RESKEY_pid=${OCF_RESKEY_pid_default}}
+
+export TMPDIR=/var/run/dcmanager
+
+#######################################################################
+
+usage() {
+    cat <<UEND
+        usage: $0 (start|stop|validate-all|meta-data|status|monitor)
+
+        $0 manages an OpenStack DC Manager State Service (dcmanager-state) process as an HA resource
+
+        The 'start' operation starts the dcmanager-state service.
+        The 'stop' operation stops the dcmanager-state service.
+        The 'validate-all' operation reports whether the parameters are valid
+        The 'meta-data' operation reports this RA's meta-data information
+        The 'status' operation reports whether the dcmanager-state service is running
+        The 'monitor' operation reports whether the dcmanager-state service seems to be working
+
+UEND
+}
+
+meta_data() {
+    cat <<END
+<?xml version="1.0"?>
+<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
+<resource-agent name="dcmanager-state">
+<version>1.0</version>
+
+<longdesc lang="en">
+Resource agent for the DC Manager service (dcmanager-state)
+</longdesc>
+<shortdesc lang="en">Manages the OpenStack DC Manager State Service (dcmanager-state)</shortdesc>
+<parameters>
+
+<parameter name="binary" unique="0" required="0">
+<longdesc lang="en">
+Location of the DC Manager State Service binary (dcmanager-state)
+</longdesc>
+<shortdesc lang="en">DC Manager State Service binary (dcmanager-state)</shortdesc>
+<content type="string" default="${OCF_RESKEY_binary_default}" />
+</parameter>
+
+<parameter name="config" unique="0" required="0">
+<longdesc lang="en">
+Location of the DC Manager State Service (dcmanager-state) configuration file
+</longdesc>
+<shortdesc lang="en">DC Manager State Service (dcmanager-state registry) config file</shortdesc>
+<content type="string" default="${OCF_RESKEY_config_default}" />
+</parameter>
+
+<parameter name="user" unique="0" required="0">
+<longdesc lang="en">
+User running DC Manager State Service (dcmanager-state)
+</longdesc>
+<shortdesc lang="en">DC Manager State Service (dcmanager-state) user</shortdesc>
+<content type="string" default="${OCF_RESKEY_user_default}" />
+</parameter>
+
+<parameter name="pid" unique="0" required="0">
+<longdesc lang="en">
+The pid file to use for this DC Manager State Service (dcmanager-state) instance
+</longdesc>
+<shortdesc lang="en">DC Manager State Service (dcmanager-state) pid file</shortdesc>
+<content type="string" default="${OCF_RESKEY_pid_default}" />
+</parameter>
+
+<parameter name="additional_parameters" unique="0" required="0">
+<longdesc lang="en">
+Additional parameters to pass on to the DC Manager State Service (dcmanager-state)
+</longdesc>
+<shortdesc lang="en">Additional parameters for dcmanager-state</shortdesc>
+<content type="string" />
+</parameter>
+
+</parameters>
+
+<actions>
+<action name="start" timeout="20" />
+<action name="stop" timeout="20" />
+<action name="status" timeout="20" />
+<action name="monitor" timeout="30" interval="20" />
+<action name="validate-all" timeout="5" />
+<action name="meta-data" timeout="5" />
+</actions>
+</resource-agent>
+END
+}
+
+#######################################################################
+# Functions invoked by resource manager actions
+
+dcmanager_state_validate() {
+    local rc
+
+    check_binary $OCF_RESKEY_binary
+    check_binary curl
+    check_binary tr
+    check_binary grep
+    check_binary cut
+    check_binary head
+
+    # A config file on shared storage that is not available
+    # during probes is OK.
+    if [ ! -f $OCF_RESKEY_config ]; then
+        if ! ocf_is_probe; then
+            ocf_log err "Config $OCF_RESKEY_config doesn't exist"
+            return $OCF_ERR_INSTALLED
+        fi
+        ocf_log_warn "Config $OCF_RESKEY_config not available during a probe"
+    fi
+
+    getent passwd $OCF_RESKEY_user >/dev/null 2>&1
+    rc=$?
+    if [ $rc -ne 0 ]; then
+        ocf_log err "User $OCF_RESKEY_user doesn't exist"
+        return $OCF_ERR_INSTALLED
+    fi
+
+    true
+}
+
+dcmanager_state_status() {
+    local pid
+    local rc
+
+    if [ ! -f $OCF_RESKEY_pid ]; then
+        ocf_log info "DC Manager State Service (dcmanager-state) is not running"
+        return $OCF_NOT_RUNNING
+    else
+        pid=`cat $OCF_RESKEY_pid`
+    fi
+
+    ocf_run -warn kill -s 0 $pid
+    rc=$?
+    if [ $rc -eq 0 ]; then
+        return $OCF_SUCCESS
+    else
+        ocf_log info "Old PID file found, but DC Manager State Service (dcmanager-state) is not running"
+        rm -f $OCF_RESKEY_pid
+        return $OCF_NOT_RUNNING
+    fi
+}
+
+dcmanager_state_monitor() {
+    local rc
+
+    dcmanager_state_status
+    rc=$?
+
+    # If status returned anything but success, return that immediately
+    if [ $rc -ne $OCF_SUCCESS ]; then
+        return $rc
+    fi
+
+    # Further verify the service availability.
+
+    ocf_log debug "DC Manager State Service (dcmanager-state) monitor succeeded"
+    return $OCF_SUCCESS
+}
+
+dcmanager_state_start() {
+    local rc
+
+    dcmanager_state_status
+    rc=$?
+    if [ $rc -eq $OCF_SUCCESS ]; then
+        ocf_log info "DC Manager State Service (dcmanager-state) already running"
+        return $OCF_SUCCESS
+    fi
+
+    # Change the working dir to /, to be sure it's accessible
+    cd /
+
+    # run the actual dcmanager-state daemon.
+dcmanager_state_start() {
+    local rc
+
+    dcmanager_state_status
+    rc=$?
+    if [ $rc -eq $OCF_SUCCESS ]; then
+        ocf_log info "DC Manager State Service (dcmanager-state) already running"
+        return $OCF_SUCCESS
+    fi
+
+    # Change the working dir to /, to be sure it's accessible
+    cd /
+
+    # Run the actual dcmanager-state daemon. Don't use ocf_run as we're
+    # sending the tool's output straight to /dev/null anyway and using
+    # ocf_run would break stdout-redirection here.
+    su ${OCF_RESKEY_user} -s /bin/sh -c "${OCF_RESKEY_binary} --config-file=$OCF_RESKEY_config \
+        $OCF_RESKEY_additional_parameters"' >> /dev/null 2>&1 & echo $!' > $OCF_RESKEY_pid
+
+    # Spin waiting for the server to come up.
+    # Let the CRM/LRM time us out if required
+    while true; do
+        dcmanager_state_monitor
+        rc=$?
+        [ $rc -eq $OCF_SUCCESS ] && break
+        if [ $rc -ne $OCF_NOT_RUNNING ]; then
+            ocf_log err "DC Manager State Service (dcmanager-state) start failed"
+            exit $OCF_ERR_GENERIC
+        fi
+        sleep 1
+    done
+
+    ocf_log info "DC Manager State Service (dcmanager-state) started"
+    return $OCF_SUCCESS
+}
+
+dcmanager_state_confirm_stop() {
+    local my_binary
+    local my_processes
+
+    my_binary=`which ${OCF_RESKEY_binary}`
+    my_processes=`pgrep -l -f "^(python|/usr/bin/python|/usr/bin/python2) ${my_binary}([^\w-]|$)"`
+
+    if [ -n "${my_processes}" ]
+    then
+        ocf_log info "About to SIGKILL the following: ${my_processes}"
+        pkill -KILL -f "^(python|/usr/bin/python|/usr/bin/python2) ${my_binary}([^\w-]|$)"
+    fi
+}
+
+dcmanager_state_stop() {
+    local rc
+    local pid
+
+    dcmanager_state_status
+    rc=$?
+    if [ $rc -eq $OCF_NOT_RUNNING ]; then
+        ocf_log info "DC Manager State Service (dcmanager-state) already stopped"
+        dcmanager_state_confirm_stop
+        return $OCF_SUCCESS
+    fi
+
+    # Try SIGTERM
+    pid=`cat $OCF_RESKEY_pid`
+    ocf_run kill -s TERM $pid
+    rc=$?
+    if [ $rc -ne 0 ]; then
+        ocf_log err "DC Manager State Service (dcmanager-state) couldn't be stopped"
+        dcmanager_state_confirm_stop
+        exit $OCF_ERR_GENERIC
+    fi
+
+    # Bounded wait: derive the shutdown window from the CRM meta timeout
+    shutdown_timeout=15
+    if [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then
+        shutdown_timeout=$((($OCF_RESKEY_CRM_meta_timeout/1000)-5))
+    fi
+    count=0
+    while [ $count -lt $shutdown_timeout ]; do
+        dcmanager_state_status
+        rc=$?
+        if [ $rc -eq $OCF_NOT_RUNNING ]; then
+            break
+        fi
+        count=`expr $count + 1`
+        sleep 1
+        ocf_log debug "DC Manager State Service (dcmanager-state) hasn't stopped yet. Waiting ..."
+    done
+
+    dcmanager_state_status
+    rc=$?
+    if [ $rc -ne $OCF_NOT_RUNNING ]; then
+        # SIGTERM didn't help either, try SIGKILL
+        ocf_log info "DC Manager State Service (dcmanager-state) failed to stop after ${shutdown_timeout}s \
+            using SIGTERM. Trying SIGKILL ..."
+        ocf_run kill -s KILL $pid
+    fi
+    dcmanager_state_confirm_stop
+
+    ocf_log info "DC Manager State Service (dcmanager-state) stopped"
+
+    rm -f $OCF_RESKEY_pid
+
+    return $OCF_SUCCESS
+}
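The stop path above follows a standard escalation protocol: SIGTERM first, a bounded
wait derived from the CRM meta timeout (default 15s, otherwise timeout/1000 minus 5s
of headroom), then SIGKILL as a last resort. The same logic as a compact Python
sketch, for clarity only (stop_daemon is illustrative, not part of this patch):

    import os
    import signal
    import time

    def stop_daemon(pid, crm_timeout_ms=None):
        """SIGTERM, bounded wait, then SIGKILL; mirrors dcmanager_state_stop."""
        shutdown_timeout = 15
        if crm_timeout_ms is not None:
            # Leave 5s of headroom to escalate before the CRM times us out
            shutdown_timeout = crm_timeout_ms // 1000 - 5
        os.kill(pid, signal.SIGTERM)
        for _ in range(shutdown_timeout):
            try:
                os.kill(pid, 0)              # probe: process still exists?
            except ProcessLookupError:
                return True                  # exited within the window
            time.sleep(1)
        os.kill(pid, signal.SIGKILL)         # SIGTERM wasn't enough; escalate
        return False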
+case "$1" in + start) dcmanager_state_start;; + stop) dcmanager_state_stop;; + status) dcmanager_state_status;; + monitor) dcmanager_state_monitor;; + validate-all) ;; + *) usage + exit $OCF_ERR_UNIMPLEMENTED;; +esac + diff --git a/distributedcloud/setup.cfg b/distributedcloud/setup.cfg index d1f450629..53a7b5bd1 100644 --- a/distributedcloud/setup.cfg +++ b/distributedcloud/setup.cfg @@ -34,6 +34,7 @@ console_scripts = dcmanager-orchestrator = dcmanager.cmd.orchestrator:main dcmanager-manager = dcmanager.cmd.manager:main dcmanager-manage = dcmanager.cmd.manage:main + dcmanager-state = dcmanager.cmd.state:main dcorch-api = dcorch.cmd.api:main dcorch-engine = dcorch.cmd.engine:main dcorch-manage = dcorch.cmd.manage:main