diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..beb9b3c4 --- /dev/null +++ b/.gitignore @@ -0,0 +1,102 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ + +# IDEA IDE +.idea/ diff --git a/drydock_provisioner/config.py b/drydock_provisioner/config.py index a8c501e1..2a60426f 100644 --- a/drydock_provisioner/config.py +++ b/drydock_provisioner/config.py @@ -27,8 +27,7 @@ class DrydockConfig(object): node_driver = { 'maasdriver': { - 'api_key': 'KTMHgA42cNSMnfmJ82:cdg4yQUhp542aHsCTV:7Dc2KB9hQpWq3LfQAAAKAj6wdg22yWxZ', - 'api_url': 'http://localhost:5240/MAAS/api/2.0/', + }, } diff --git a/drydock_provisioner/control/api.py b/drydock_provisioner/control/api.py index 7766cf7f..a6d832f9 100644 --- a/drydock_provisioner/control/api.py +++ b/drydock_provisioner/control/api.py @@ -31,17 +31,21 @@ def start_api(state_manager=None, ingester=None, orchestrator=None): control_api = falcon.API(request_type=DrydockRequest, middleware=[AuthMiddleware(), ContextMiddleware(), LoggingMiddleware()]) + # v1.0 of Drydock API + v1_0_routes = [ # API for managing orchestrator tasks - control_api.add_route('/tasks', TasksResource(state_manager=state_manager, orchestrator=orchestrator)) - control_api.add_route('/tasks/{task_id}', TaskResource(state_manager=state_manager)) + ('/tasks', TasksResource(state_manager=state_manager, orchestrator=orchestrator)), + ('/tasks/{task_id}', TaskResource(state_manager=state_manager)), # API for managing site design data - control_api.add_route('/designs', DesignsResource(state_manager=state_manager)) - control_api.add_route('/designs/{design_id}', DesignResource(state_manager=state_manager, orchestrator=orchestrator)) - control_api.add_route('/designs/{design_id}/parts', DesignsPartsResource(state_manager=state_manager, ingester=ingester)) - control_api.add_route('/designs/{design_id}/parts/{kind}', DesignsPartsKindsResource(state_manager=state_manager)) + ('/designs', DesignsResource(state_manager=state_manager)), + ('/designs/{design_id}', DesignResource(state_manager=state_manager, orchestrator=orchestrator)), + ('/designs/{design_id}/parts', DesignsPartsResource(state_manager=state_manager, ingester=ingester)), + ('/designs/{design_id}/parts/{kind}', DesignsPartsKindsResource(state_manager=state_manager)), + ('/designs/{design_id}/parts/{kind}/{name}', 
DesignsPartResource(state_manager=state_manager, orchestrator=orchestrator)) + ] - control_api.add_route('/designs/{design_id}/parts/{kind}/{name}', - DesignsPartResource(state_manager=state_manager, orchestrator=orchestrator)) + for path, res in v1_0_routes: + control_api.add_route('/api/v1.0' + path, res) return control_api diff --git a/drydock_provisioner/control/base.py b/drydock_provisioner/control/base.py index ac1e787a..e5d39a15 100644 --- a/drydock_provisioner/control/base.py +++ b/drydock_provisioner/control/base.py @@ -118,6 +118,7 @@ class DrydockRequestContext(object): self.user = None self.roles = ['anyone'] self.request_id = str(uuid.uuid4()) + self.external_marker = None def set_log_level(self, level): @@ -138,7 +139,7 @@ class DrydockRequestContext(object): if x != role] def set_external_marker(self, marker): - self.external_marker = str(marker)[:32] + self.external_marker = str(marker)[:20] class DrydockRequest(request.Request): context_type = DrydockRequestContext \ No newline at end of file diff --git a/drydock_provisioner/control/middleware.py b/drydock_provisioner/control/middleware.py index 7a25acd4..b3e1515b 100644 --- a/drydock_provisioner/control/middleware.py +++ b/drydock_provisioner/control/middleware.py @@ -72,11 +72,9 @@ class ContextMiddleware(object): elif requested_logging == 'INFO': ctx.set_log_level('INFO') - ctx.req_id = str(uuid.uuid4()) - ext_marker = req.get_header('X-Context-Marker') - - ctx.external_ctx = ext_marker if ext_marker is not None else '' + + ctx.set_external_marker(ext_marker if ext_marker is not None else '') class LoggingMiddleware(object): @@ -88,7 +86,7 @@ class LoggingMiddleware(object): extra = { 'user': ctx.user, 'req_id': ctx.req_id, - 'external_ctx': ctx.external_ctx, + 'external_ctx': ctx.external_marker, } resp.append_header('X-Drydock-Req', ctx.req_id) self.logger.info("%s - %s" % (req.uri, resp.status), extra=extra) diff --git a/drydock_provisioner/control/readme.md b/drydock_provisioner/control/readme.md index b6da6637..68f5c48c 100644 --- a/drydock_provisioner/control/readme.md +++ b/drydock_provisioner/control/readme.md @@ -3,28 +3,30 @@ This is the external facing API service to control the rest of Drydock and query Drydock-managed data. -## Endpoints ## -### /tasks ### +## v1.0 Endpoints ## + +### /api/v1.0/tasks ### POST - Create a new orchestration task and submit it for execution GET - Get status of a task DELETE - Cancel execution of a task if permitted -### /designs ### + +### /api/v1.0/designs ### POST - Create a new site design so design parts can be added -### /designs/{id} +### /api/v1.0/designs/{id} GET - Get a current design if available. Param 'source=compiled' to calculate the inheritance chain and compile the effective design. -### /designs/{id}/parts +### /api/v1.0/designs/{id}/parts POST - Submit a new design part to be ingested and added to this design GET - View a currently defined design part PUT - Replace an existing design part *Not Implemented* -### /designs/{id}/parts/{kind}/{name} +### /api/v1.0/designs/{id}/parts/{kind}/{name} GET - View a single design part. param 'source=compiled' to calculate the inheritance chain and compile the effective configuration for the design part. 
\ No newline at end of file diff --git a/drydock_provisioner/drivers/node/__init__.py b/drydock_provisioner/drivers/node/__init__.py index 48802905..a28c7cec 100644 --- a/drydock_provisioner/drivers/node/__init__.py +++ b/drydock_provisioner/drivers/node/__init__.py @@ -28,6 +28,7 @@ class NodeDriver(ProviderDriver): hd_fields.OrchestratorAction.CreateStorageTemplate, hd_fields.OrchestratorAction.CreateBootMedia, hd_fields.OrchestratorAction.PrepareHardwareConfig, + hd_fields.OrchestratorAction.IdentifyNode, hd_fields.OrchestratorAction.ConfigureHardware, hd_fields.OrchestratorAction.InterrogateNode, hd_fields.OrchestratorAction.ApplyNodeNetworking, diff --git a/drydock_provisioner/drivers/node/maasdriver/driver.py b/drydock_provisioner/drivers/node/maasdriver/driver.py index c6d0cc9d..205e68aa 100644 --- a/drydock_provisioner/drivers/node/maasdriver/driver.py +++ b/drydock_provisioner/drivers/node/maasdriver/driver.py @@ -11,6 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import time +import logging + import drydock_provisioner.error as errors import drydock_provisioner.config as config import drydock_provisioner.drivers as drivers @@ -22,6 +25,7 @@ from .api_client import MaasRequestFactory import drydock_provisioner.drivers.node.maasdriver.models.fabric as maas_fabric import drydock_provisioner.drivers.node.maasdriver.models.vlan as maas_vlan import drydock_provisioner.drivers.node.maasdriver.models.subnet as maas_subnet +import drydock_provisioner.drivers.node.maasdriver.models.machine as maas_machine class MaasNodeDriver(NodeDriver): @@ -34,6 +38,8 @@ class MaasNodeDriver(NodeDriver): self.config = config.DrydockConfig.node_driver[self.driver_key] + self.logger = logging.getLogger('drydock.nodedriver.maasdriver') + def execute_task(self, task_id): task = self.state_manager.get_task(task_id) @@ -104,6 +110,9 @@ class MaasNodeDriver(NodeDriver): site_design = self.orchestrator.get_effective_site(design_id) if task.action == hd_fields.OrchestratorAction.CreateNetworkTemplate: + + self.orchestrator.task_field_update(task.get_id(), status=hd_fields.TaskStatus.Running) + subtask = self.orchestrator.create_task(task_model.DriverTask, parent_task_id=task.get_id(), design_id=design_id, action=task.action, site_name=task.site_name, @@ -111,8 +120,14 @@ class MaasNodeDriver(NodeDriver): runner = MaasTaskRunner(state_manager=self.state_manager, orchestrator=self.orchestrator, task_id=subtask.get_id(),config=self.config) + + self.logger.info("Starting thread for task %s to create network templates" % (subtask.get_id())) + runner.start() + # TODO Figure out coherent system for putting all the timeouts in + # the config + runner.join(timeout=120) if runner.is_alive(): @@ -120,18 +135,89 @@ class MaasNodeDriver(NodeDriver): 'retry': False, 'detail': 'MaaS Network creation timed-out' } + + self.logger.warn("Thread for task %s timed out after 120s" % (subtask.get_id())) self.orchestrator.task_field_update(task.get_id(), status=hd_fields.TaskStatus.Complete, result=hd_fields.ActionResult.Failure, result_detail=result) else: subtask = self.state_manager.get_task(subtask.get_id()) + + self.logger.info("Thread for task %s completed - result %s" % (subtask.get_id(), subtask.get_result())) self.orchestrator.task_field_update(task.get_id(), status=hd_fields.TaskStatus.Complete, result=subtask.get_result()) return + elif task.action == 
hd_fields.OrchestratorAction.IdentifyNode: + self.orchestrator.task_field_update(task.get_id(), + status=hd_fields.TaskStatus.Running) + + subtasks = [] + + result_detail = { + 'detail': [] + } + + for n in task.node_list: + subtask = self.orchestrator.create_task(task_model.DriverTask, + parent_task_id=task.get_id(), design_id=design_id, + action=hd_fields.OrchestratorAction.IdentifyNode, + site_name=task.site_name, + task_scope={'site': task.site_name, 'node_names': [n]}) + runner = MaasTaskRunner(state_manager=self.state_manager, + orchestrator=self.orchestrator, + task_id=subtask.get_id(),config=self.config) + + self.logger.info("Starting thread for task %s to identify node %s" % (subtask.get_id(), n)) + + runner.start() + subtasks.append(subtask.get_id()) + + running_subtasks = len(subtasks) + attempts = 0 + worked = failed = False + + #TODO Add timeout to config + while running_subtasks > 0 and attempts < 3: + for t in subtasks: + subtask = self.state_manager.get_task(t) + + if subtask.status == hd_fields.TaskStatus.Complete: + self.logger.info("Task %s to identify node %s complete - status %s" % + (subtask.get_id(), n, subtask.get_result())) + + result_detail['detail'].extend(subtask.result_detail['detail']) + running_subtasks = running_subtasks - 1 + + if subtask.result in [hd_fields.ActionResult.Success, + hd_fields.ActionResult.PartialSuccess]: + worked = True + elif subtask.result in [hd_fields.ActionResult.Failure, + hd_fields.ActionResult.PartialSuccess]: + failed = True + + time.sleep(1 * 60) + attempts = attempts + 1 + + if running_subtasks > 0: + self.logger.warn("Time out for task %s before all subtask threads complete" % (task.get_id())) + result = hd_fields.ActionResult.DependentFailure + result_detail['detail'].append('Some subtasks did not complete before the timeout threshold') + if worked and failed: + result = hd_fields.ActionResult.PartialSuccess + elif worked: + result = hd_fields.ActionResult.Success + else: + result = hd_fields.ActionResult.Failure + + self.orchestrator.task_field_update(task.get_id(), + status=hd_fields.TaskStatus.Complete, + result=result, + result_detail=result_detail) + class MaasTaskRunner(drivers.DriverTaskRunner): def __init__(self, config=None, **kwargs): @@ -139,6 +225,8 @@ class MaasTaskRunner(drivers.DriverTaskRunner): self.driver_config = config + self.logger = logging.getLogger('drydock.nodedriver.maasdriver') + def execute_task(self): task_action = self.task.action @@ -314,4 +402,48 @@ class MaasTaskRunner(drivers.DriverTaskRunner): self.orchestrator.task_field_update(self.task.get_id(), status=hd_fields.TaskStatus.Complete, result=action_result, - result_detail=result_detail) \ No newline at end of file + result_detail=result_detail) + elif task_action == hd_fields.OrchestratorAction.IdentifyNode: + try: + machine_list = maas_machine.Machines(self.maas_client) + machine_list.refresh() + except: + self.orchestrator.task_field_update(self.task.get_id(), + status=hd_fields.TaskStatus.Complete, + result=hd_fields.ActionResult.Failure, + result_detail={'detail': 'Error accessing MaaS Machines API', 'retry': True}) + return + + nodes = self.task.node_list + + result_detail = {'detail': []} + + worked = failed = False + + for n in nodes: + try: + node = site_design.get_baremetal_node(n) + machine = machine_list.identify_baremetal_node(node) + if machine is not None: + worked = True + result_detail['detail'].append("Node %s identified in MaaS" % n) + else: + failed = True + result_detail['detail'].append("Node %s not found in MaaS" % n) + 
except Exception as ex: + failed = True + result_detail['detail'].append("Error identifying node %s: %s" % (n, str(ex))) + + result = None + if worked and failed: + result = hd_fields.ActionResult.PartialSuccess + elif worked: + result = hd_fields.ActionResult.Success + elif failed: + result = hd_fields.ActionResult.Failure + + self.orchestrator.task_field_update(self.task.get_id(), + status=hd_fields.TaskStatus.Complete, + result=result, + result_detail=result_detail) + diff --git a/drydock_provisioner/drivers/node/maasdriver/models/base.py b/drydock_provisioner/drivers/node/maasdriver/models/base.py index fb033eec..c08644bb 100644 --- a/drydock_provisioner/drivers/node/maasdriver/models/base.py +++ b/drydock_provisioner/drivers/node/maasdriver/models/base.py @@ -13,6 +13,7 @@ # limitations under the License. import json import re +import logging import drydock_provisioner.error as errors """ @@ -28,6 +29,7 @@ class ResourceBase(object): def __init__(self, api_client, **kwargs): self.api_client = api_client + self.logger = logging.getLogger('drydock.drivers.maasdriver') for f in self.fields: if f in kwargs.keys(): @@ -143,13 +145,15 @@ class ResourceBase(object): return i -""" -A collection of MaaS resources. - -Rather than a simple list, we will key the collection on resource -ID for more efficient access. -""" class ResourceCollectionBase(object): + """ + A collection of MaaS resources. + + Rather than a simple list, we will key the collection on resource + ID for more efficient access. + + :param api_client: An instance of api_client.MaasRequestFactory + """ collection_url = '' collection_resource = ResourceBase @@ -157,12 +161,14 @@ class ResourceCollectionBase(object): def __init__(self, api_client): self.api_client = api_client self.resources = {} + self.logger = logging.getLogger('drydock.drivers.maasdriver') - """ - Parse URL for placeholders and replace them with current - instance values - """ def interpolate_url(self): + """ + Parse URL for placeholders and replace them with current + instance values + """ + pattern = '\{([a-z_]+)\}' regex = re.compile(pattern) start = 0 diff --git a/drydock_provisioner/drivers/node/maasdriver/models/interface.py b/drydock_provisioner/drivers/node/maasdriver/models/interface.py new file mode 100644 index 00000000..5c257279 --- /dev/null +++ b/drydock_provisioner/drivers/node/maasdriver/models/interface.py @@ -0,0 +1,34 @@ +# Copyright 2017 AT&T Intellectual Property. All other rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import drydock_provisioner.drivers.node.maasdriver.models.base as model_base + +class Interface(model_base.ResourceBase): + + resource_url = 'nodes/{system_id}/interfaces/{resource_id}/' + fields = ['resource_id', 'system_id', 'name', 'type', 'mac_address', 'vlan', + 'links', 'effective_mtu'] + json_fields = ['name', 'type', 'mac_address', 'vlan', 'links', 'effective_mtu'] + + def __init__(self, api_client, **kwargs): + super(Interface, self).__init__(api_client, **kwargs) + +class Interfaces(model_base.ResourceCollectionBase): + + collection_url = 'nodes/{system_id}/interfaces/' + collection_resource = Interface + + def __init__(self, api_client, **kwargs): + super(Interfaces, self).__init__(api_client) + self.system_id = kwargs.get('system_id', None) \ No newline at end of file diff --git a/drydock_provisioner/drivers/node/maasdriver/models/machine.py b/drydock_provisioner/drivers/node/maasdriver/models/machine.py new file mode 100644 index 00000000..f4ec3609 --- /dev/null +++ b/drydock_provisioner/drivers/node/maasdriver/models/machine.py @@ -0,0 +1,185 @@ +# Copyright 2017 AT&T Intellectual Property. All other rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import drydock_provisioner.drivers.node.maasdriver.models.base as model_base +import drydock_provisioner.drivers.node.maasdriver.models.interface as maas_interface +import bson +import yaml + +class Machine(model_base.ResourceBase): + + resource_url = 'machines/{resource_id}/' + fields = ['resource_id', 'hostname', 'power_type', 'power_state', 'power_parameters', 'interfaces', + 'boot_interface', 'memory', 'cpu_count', 'tag_names'] + json_fields = ['hostname', 'power_type'] + + def __init__(self, api_client, **kwargs): + super(Machine, self).__init__(api_client, **kwargs) + + # Replace generic dicts with interface collection model + if getattr(self, 'resource_id', None) is not None: + self.interfaces = maas_interface.Interfaces(api_client, system_id=self.resource_id) + self.interfaces.refresh() + + def get_power_params(self): + url = self.interpolate_url() + + resp = self.api_client.get(url, op='power_parameters') + + if resp.status_code == 200: + self.power_parameters = resp.json() + + def commission(self, debug=False): + url = self.interpolate_url() + + # If we want to debug this node commissioning, enable SSH + # after commissioning and leave the node powered up + + options = {'enable_ssh': '1' if debug else '0'} + + resp = self.api_client.post(url, op='commission', files=options) + + # Need to sort out how to handle exceptions + if not resp.ok: + raise Exception() + + def get_details(self): + url = self.interpolate_url() + + resp = self.api_client.get(url, op='details') + + if resp.status_code == 200: + detail_config = bson.loads(resp.text) + return detail_config + + + def to_dict(self): + """ + Serialize this resource instance into a dict matching the + MAAS representation of the resource + """ + data_dict = {} + + for f in self.json_fields: + if getattr(self, f, None) is not None: + if f == 'resource_id': + 
data_dict['system_id'] = getattr(self, f) + else: + data_dict[f] = getattr(self, f) + + return data_dict + + @classmethod + def from_dict(cls, api_client, obj_dict): + """ + Create a instance of this resource class based on a dict + of MaaS type attributes + + Customized for Machine due to use of system_id instead of id + as resource key + + :param api_client: Instance of api_client.MaasRequestFactory for accessing MaaS API + :param obj_dict: Python dict as parsed from MaaS API JSON representing this resource type + """ + + refined_dict = {k: obj_dict.get(k, None) for k in cls.fields} + + if 'system_id' in obj_dict.keys(): + refined_dict['resource_id'] = obj_dict.get('system_id') + + i = cls(api_client, **refined_dict) + return i + +class Machines(model_base.ResourceCollectionBase): + + collection_url = 'machines/' + collection_resource = Machine + + def __init__(self, api_client, **kwargs): + super(Machines, self).__init__(api_client) + + # Add the OOB power parameters to each machine instance + def collect_power_params(self): + for k, v in self.resources.items(): + v.get_power_params() + + + def identify_baremetal_node(self, node_model, update_name=True): + """ + Search all the defined MaaS Machines and attempt to match + one against the provided Drydock BaremetalNode model. Update + the MaaS instance with the correct hostname + + :param node_model: Instance of objects.node.BaremetalNode to search MaaS for matching resource + :param update_name: Whether Drydock should update the MaaS resource name to match the Drydock design + """ + node_oob_network = node_model.oob_network + node_oob_ip = node_model.get_network_address(node_oob_network) + + if node_oob_ip is None: + self.logger.warn("Node model missing OOB IP address") + raise ValueError('Node model missing OOB IP address') + + try: + self.collect_power_params() + + maas_node = self.singleton({'power_params.power_address': node_oob_ip}) + + self.logger.debug("Found MaaS resource %s matching Node %s" % (maas_node.resource_id, node_model.get_id())) + + if maas_node.hostname != node_model.name and update_name: + maas_node.hostname = node_model.name + maas_node.update() + self.logger.debug("Updated MaaS resource %s hostname to %s" % (maas_node.resource_id, node_model.name)) + return maas_node + + except ValueError as ve: + self.logger.warn("Error locating matching MaaS resource for OOB IP %s" % (node_oob_ip)) + return None + + def query(self, query): + """ + Custom query method to deal with complex fields + """ + result = list(self.resources.values()) + for (k, v) in query.items(): + if k.startswith('power_params.'): + field = k[13:] + result = [i for i in result + if str(getattr(i,'power_parameters', {}).get(field, None)) == str(v)] + else: + result = [i for i in result + if str(getattr(i, k, None)) == str(v)] + + return result + + + def add(self, res): + """ + Create a new resource in this collection in MaaS + + Customize as Machine resources use 'system_id' instead of 'id' + """ + data_dict = res.to_dict() + url = self.interpolate_url() + + resp = self.api_client.post(url, files=data_dict) + + if resp.status_code == 200: + resp_json = resp.json() + res.set_resource_id(resp_json.get('system_id')) + return res + + raise errors.DriverError("Failed updating MAAS url %s - return code %s" + % (url, resp.status_code)) \ No newline at end of file diff --git a/drydock_provisioner/drivers/oob/pyghmi_driver/__init__.py b/drydock_provisioner/drivers/oob/pyghmi_driver/__init__.py index a4378369..9010fbcf 100644 --- 
a/drydock_provisioner/drivers/oob/pyghmi_driver/__init__.py +++ b/drydock_provisioner/drivers/oob/pyghmi_driver/__init__.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import time +import logging from pyghmi.ipmi.command import Command @@ -34,15 +35,19 @@ class PyghmiDriver(oob.OobDriver): self.driver_key = "pyghmi_driver" self.driver_desc = "Pyghmi OOB Driver" + self.logger = logging.getLogger('drydock.oobdriver.pyghmi') self.config = config.DrydockConfig.node_driver.get(self.driver_key, {}) def execute_task(self, task_id): task = self.state_manager.get_task(task_id) if task is None: + self.logger.error("Invalid task %s" % (task_id)) raise errors.DriverError("Invalid task %s" % (task_id)) if task.action not in self.supported_actions: + self.logger.error("Driver %s doesn't support task action %s" + % (self.driver_desc, task.action)) raise errors.DriverError("Driver %s doesn't support task action %s" % (self.driver_desc, task.action)) @@ -66,7 +71,7 @@ class PyghmiDriver(oob.OobDriver): result=hd_fields.ActionResult.Success) return - site_design = self.orchestrator.get_effective_site(design_id, task.site_name) + site_design = self.orchestrator.get_effective_site(design_id) target_nodes = [] @@ -118,13 +123,6 @@ class PyghmiDriver(oob.OobDriver): if x.get_result() in [hd_fields.ActionResult.PartialSuccess, hd_fields.ActionResult.Failure]] - print("Task %s successful subtasks: %s" % - (task.get_id(), len(success_subtasks))) - print("Task %s unsuccessful subtasks: %s" % - (task.get_id(), len(nosuccess_subtasks))) - print("Task %s total subtasks: %s" % - (task.get_id(), len(task.get_subtasks()))) - task_result = None if len(success_subtasks) > 0 and len(nosuccess_subtasks) > 0: task_result = hd_fields.ActionResult.PartialSuccess @@ -145,9 +143,11 @@ class PyghmiTaskRunner(drivers.DriverTaskRunner): def __init__(self, node=None, **kwargs): super(PyghmiTaskRunner, self).__init__(**kwargs) + self.logger = logging.getLogger('drydock.oobdriver.pyghmi') # We cheat here by providing the Node model instead # of making the runner source it from statemgmt if node is None: + self.logger.error("Did not specify target node") raise errors.DriverError("Did not specify target node") self.node = node @@ -171,8 +171,7 @@ class PyghmiTaskRunner(drivers.DriverTaskRunner): raise errors.DriverError("Runner node does not match " \ "task node scope") - - ipmi_network = self.node.applied.get('oob_network') + ipmi_network = self.node.oob_network ipmi_address = self.node.get_network_address(ipmi_network) if ipmi_address is None: @@ -184,8 +183,8 @@ class PyghmiTaskRunner(drivers.DriverTaskRunner): self.orchestrator.task_field_update(self.task.get_id(), status=hd_fields.TaskStatus.Running) - ipmi_account = self.node.applied.get('oob_account', '') - ipmi_credential = self.node.applied.get('oob_credential', '') + ipmi_account = self.node.oob_account + ipmi_credential = self.node.oob_credential ipmi_session = Command(bmc=ipmi_address, userid=ipmi_account, password=ipmi_credential) diff --git a/drydock_provisioner/drivers/readme.md b/drydock_provisioner/drivers/readme.md index 0aab4c1c..63214dd7 100644 --- a/drydock_provisioner/drivers/readme.md +++ b/drydock_provisioner/drivers/readme.md @@ -32,6 +32,7 @@ and storage. 
* CreateStorageTemplate - Configure site-wide storage information in bootstrapper * CreateBootMedia - Ensure all needed boot media is available to the bootstrapper including external repositories * PrepareHardwareConfig - Prepare the bootstrapper to handle all hardware configuration actions (firmware updates, RAID configuration, driver installation) +* IdentifyNode - Correlate a node definition in the Drydock internal model with a node detected by the downstream node bootstrapper. * ConfigureHardware - Update and validate all hardware configurations on a node prior to deploying the OS on it * InterrogateNode - Interrogate the bootstrapper about node information. Depending on the current state of the node, this interrogation will produce different information. * ApplyNodeNetworking - Configure networking for a node diff --git a/drydock_provisioner/drydock.py b/drydock_provisioner/drydock.py index 93e91982..18e53a13 100644 --- a/drydock_provisioner/drydock.py +++ b/drydock_provisioner/drydock.py @@ -28,7 +28,7 @@ def start_drydock(): logger.setLevel(config.DrydockConfig.global_config.get('log_level')) ch = logging.StreamHandler() - formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') + formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(filename)s:%(funcName)s - %(message)s') ch.setFormatter(formatter) logger.addHandler(ch) diff --git a/drydock_provisioner/ingester/__init__.py b/drydock_provisioner/ingester/__init__.py index 0938f710..daaff310 100644 --- a/drydock_provisioner/ingester/__init__.py +++ b/drydock_provisioner/ingester/__init__.py @@ -83,7 +83,11 @@ class Ingester(object): self.logger.debug("Ingester:ingest_data ingesting design parts for design %s" % design_id) if plugin_name in self.registered_plugins: - design_items = self.registered_plugins[plugin_name].ingest_data(**kwargs) + try: + design_items = self.registered_plugins[plugin_name].ingest_data(**kwargs) + except ValueError as vex: + self.logger.warn("Ingester:ingest_data - Error process data - %s" % (str(vex))) + return None self.logger.debug("Ingester:ingest_data parsed %s design parts" % str(len(design_items))) for m in design_items: if context is not None: diff --git a/drydock_provisioner/ingester/plugins/yaml.py b/drydock_provisioner/ingester/plugins/yaml.py index 37e1ad06..fce62c32 100644 --- a/drydock_provisioner/ingester/plugins/yaml.py +++ b/drydock_provisioner/ingester/plugins/yaml.py @@ -69,7 +69,7 @@ class YamlIngester(IngesterPlugin): """ def parse_docs(self, yaml_string): models = [] - + self.logger.debug("yamlingester:parse_docs - Parsing YAML string \n%s" % (yaml_string)) try: parsed_data = yaml.load_all(yaml_string) except yaml.YAMLError as err: diff --git a/drydock_provisioner/objects/fields.py b/drydock_provisioner/objects/fields.py index c6ac8ac3..2abb6098 100644 --- a/drydock_provisioner/objects/fields.py +++ b/drydock_provisioner/objects/fields.py @@ -44,6 +44,7 @@ class OrchestratorAction(BaseDrydockEnum): CreateStorageTemplate = 'create_storage_template' CreateBootMedia = 'create_boot_media' PrepareHardwareConfig = 'prepare_hardware_config' + IdentifyNode = 'identify_node' ConfigureHardware = 'configure_hardware' InterrogateNode = 'interrogate_node' ApplyNodeNetworking = 'apply_node_networking' diff --git a/drydock_provisioner/orchestrator/__init__.py b/drydock_provisioner/orchestrator/__init__.py index 3736faa2..c6c50882 100644 --- a/drydock_provisioner/orchestrator/__init__.py +++ b/drydock_provisioner/orchestrator/__init__.py @@ -16,6 +16,7 @@ import uuid import 
time import threading import importlib +import logging from copy import deepcopy @@ -32,6 +33,7 @@ class Orchestrator(object): self.enabled_drivers = {} self.state_manager = state_manager + self.logger = logging.getLogger('drydock.orchestrator') if enabled_drivers is not None: oob_driver_name = enabled_drivers.get('oob', None) @@ -155,10 +157,14 @@ class Orchestrator(object): task_scope=task_scope, action=hd_fields.OrchestratorAction.CreateNetworkTemplate) + self.logger.info("Starting node driver task %s to create network templates" % (driver_task.get_id())) + driver.execute_task(driver_task.get_id()) driver_task = self.state_manager.get_task(driver_task.get_id()) + self.logger.info("Node driver task %s complete" % (driver_task.get_id())) + self.task_field_update(task_id, status=hd_fields.TaskStatus.Complete, result=driver_task.get_result()) @@ -166,13 +172,13 @@ class Orchestrator(object): elif task.action == hd_fields.OrchestratorAction.VerifyNode: self.task_field_update(task_id, status=hd_fields.TaskStatus.Running) + oob_driver = self.enabled_drivers['oob'] - driver = self.enabled_drivers['oob'] - - if driver is None: + if oob_driver is None: self.task_field_update(task_id, status=hd_fields.TaskStatus.Errored, - result=hd_fields.ActionResult.Failure) + result=hd_fields.ActionResult.Failure, + result_detail={'detail': 'Error: No oob driver configured', 'retry': False}) return site_design = self.get_effective_site(design_id) @@ -186,30 +192,42 @@ class Orchestrator(object): task_scope = {'site' : task_site, 'node_names' : target_names} - driver_task = self.create_task(tasks.DriverTask, + oob_driver_task = self.create_task(tasks.DriverTask, parent_task_id=task.get_id(), design_id=design_id, - action=hd_fields.OrchestratorAction.InterrogateNode, + action=hd_fields.OrchestratorAction.InterrogateOob, task_scope=task_scope) - driver.execute_task(driver_task.get_id()) + oob_driver.execute_task(oob_driver_task.get_id()) - driver_task = self.state_manager.get_task(driver_task.get_id()) + oob_driver_task = self.state_manager.get_task(oob_driver_task.get_id()) self.task_field_update(task_id, status=hd_fields.TaskStatus.Complete, - result=driver_task.get_result()) + result=oob_driver_task.get_result()) return elif task.action == hd_fields.OrchestratorAction.PrepareNode: + failed = worked = False + self.task_field_update(task_id, status=hd_fields.TaskStatus.Running) - driver = self.enabled_drivers['oob'] + oob_driver = self.enabled_drivers['oob'] - if driver is None: + if oob_driver is None: self.task_field_update(task_id, status=hd_fields.TaskStatus.Errored, - result=hd_fields.ActionResult.Failure) + result=hd_fields.ActionResult.Failure, + result_detail={'detail': 'Error: No oob driver configured', 'retry': False}) + return + + node_driver = self.enabled_drivers['node'] + + if node_driver is None: + self.task_field_update(task_id, + status=hd_fields.TaskStatus.Errored, + result=hd_fields.ActionResult.Failure, + result_detail={'detail': 'Error: No node driver configured', 'retry': False}) return site_design = self.get_effective_site(design_id) @@ -228,34 +246,89 @@ class Orchestrator(object): design_id=design_id, action=hd_fields.OrchestratorAction.SetNodeBoot, task_scope=task_scope) + + self.logger.info("Starting OOB driver task %s to set PXE boot" % (setboot_task.get_id())) - driver.execute_task(setboot_task.get_id()) + oob_driver.execute_task(setboot_task.get_id()) + + self.logger.info("OOB driver task %s complete" % (setboot_task.get_id())) setboot_task = 
self.state_manager.get_task(setboot_task.get_id()) + if setboot_task.get_result() == hd_fields.ActionResult.Success: + worked = True + elif setboot_task.get_result() == hd_fields.ActionResult.PartialSuccess: + worked = failed = True + elif setboot_task.get_result() == hd_fields.ActionResult.Failure: + failed = True + cycle_task = self.create_task(tasks.DriverTask, parent_task_id=task.get_id(), design_id=design_id, action=hd_fields.OrchestratorAction.PowerCycleNode, task_scope=task_scope) - driver.execute_task(cycle_task.get_id()) + + self.logger.info("Starting OOB driver task %s to power cycle nodes" % (cycle_task.get_id())) + + oob_driver.execute_task(cycle_task.get_id()) + + self.logger.info("OOB driver task %s complete" % (cycle_task.get_id())) cycle_task = self.state_manager.get_task(cycle_task.get_id()) - if (setboot_task.get_result() == hd_fields.ActionResult.Success and - cycle_task.get_result() == hd_fields.ActionResult.Success): - self.task_field_update(task_id, - status=hd_fields.TaskStatus.Complete, - result=hd_fields.ActionResult.Success) - elif (setboot_task.get_result() == hd_fields.ActionResult.Success or - cycle_task.get_result() == hd_fields.ActionResult.Success): - self.task_field_update(task_id, - status=hd_fields.TaskStatus.Complete, - result=hd_fields.ActionResult.PartialSuccess) + if cycle_task.get_result() == hd_fields.ActionResult.Success: + worked = True + elif cycle_task.get_result() == hd_fields.ActionResult.PartialSuccess: + worked = failed = True + elif cycle_task.get_result() == hd_fields.ActionResult.Failure: + failed = True + + + # IdentifyNode success will take some time after PowerCycleNode finishes + # Retry the operation a few times if it fails before considering it a final failure + # Each attempt is a new task which might make the final task tree a bit confusing + + node_identify_attempts = 0 + + while True: + + node_identify_task = self.create_task(tasks.DriverTask, + parent_task_id=task.get_id(), + design_id=design_id, + action=hd_fields.OrchestratorAction.IdentifyNode, + task_scope=task_scope) + + self.logger.info("Starting node driver task %s to identify node - attempt %s" % + (node_identify_task.get_id(), node_identify_attempts+1)) + + node_driver.execute_task(node_identify_task.get_id()) + node_identify_attempts = node_identify_attempts + 1 + + node_identify_task = self.state_manager.get_task(node_identify_task.get_id()) + + if node_identify_task.get_result() == hd_fields.ActionResult.Success: + worked = True + break + elif node_identify_task.get_result() in [hd_fields.ActionResult.PartialSuccess, + hd_fields.ActionResult.Failure]: + # TODO This threshold should be a configurable default and tunable by task API + if node_identify_attempts > 2: + failed = True + break + + time.sleep(5 * 60) + + final_result = None + if worked and failed: + final_result = hd_fields.ActionResult.PartialSuccess + elif worked: + final_result = hd_fields.ActionResult.Success else: - self.task_field_update(task_id, - status=hd_fields.TaskStatus.Complete, - result=hd_fields.ActionResult.Failure) + final_result = hd_fields.ActionResult.Failure + + self.task_field_update(task_id, + status=hd_fields.TaskStatus.Complete, + result=final_result) return else: diff --git a/setup.py b/setup.py index dc97cc33..b090c19c 100644 --- a/setup.py +++ b/setup.py @@ -62,6 +62,7 @@ setup(name='drydock_provisioner', 'requests', 'oauthlib', 'uwsgi>1.4', + 'bson===0.4.7' ] )
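The api.py change above moves every control route under an /api/v1.0 prefix by iterating a route table, and control/readme.md documents the endpoints accordingly; ContextMiddleware now stores the X-Context-Marker request header on the context (truncated to 20 characters by set_external_marker), and LoggingMiddleware returns the request ID in an X-Drydock-Req response header. A minimal client sketch against those paths, assuming a locally running instance; the host, port, POST body, and response shape are illustrative and not taken from the diff.

```python
# Hypothetical client calls against the re-versioned API. Only the paths,
# the X-Context-Marker request header, and the X-Drydock-Req response header
# come from the diff; host, port, body, and response shape are assumptions.
import uuid
import requests

DRYDOCK_URL = 'http://localhost:9000'          # assumed listener address
MARKER = str(uuid.uuid4())[:20]                # contexts keep at most 20 chars

headers = {'X-Context-Marker': MARKER}

# Create a design, then read it back compiled (see /api/v1.0/designs/{id})
resp = requests.post(f'{DRYDOCK_URL}/api/v1.0/designs', headers=headers)
design_id = resp.json().get('id') if resp.ok else None   # response shape assumed

resp = requests.get(f'{DRYDOCK_URL}/api/v1.0/designs/{design_id}',
                    params={'source': 'compiled'}, headers=headers)
print(resp.status_code, resp.headers.get('X-Drydock-Req'))
```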
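In the CreateNetworkTemplate branch, the MaaS driver starts a MaasTaskRunner thread, joins it with a 120-second timeout (flagged by an in-line TODO as something to move into config), and treats a thread that is still alive after the join as a timed-out subtask. The same pattern reduced to a runnable sketch with a stub worker in place of the real runner:

```python
# Timeout pattern used around MaasTaskRunner in driver.py: start the runner
# thread, join with a timeout, and treat a still-alive thread as a timed-out
# subtask. StubRunner stands in for the real task runner.
import threading
import time

class StubRunner(threading.Thread):
    def __init__(self, duration):
        super().__init__(daemon=True)
        self.duration = duration
        self.result = None

    def run(self):
        time.sleep(self.duration)
        self.result = 'success'

runner = StubRunner(duration=2)
runner.start()
runner.join(timeout=1)            # driver.py uses 120 s

if runner.is_alive():
    print('subtask timed out')    # mapped to ActionResult.Failure in the driver
else:
    print('subtask result:', runner.result)
```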
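drivers/readme.md describes IdentifyNode as correlating a node in the Drydock model with a node the bootstrapper has discovered, and Machines.identify_baremetal_node in models/machine.py does that by collecting each machine's power parameters and matching power_address against the node's OOB IP. An illustrative restatement of the matching rule with simplified stand-in types; the "exactly one match" condition reflects the assumed behaviour of the singleton() helper, which the diff does not show.

```python
# Simplified restatement of the matching rule in identify_baremetal_node:
# a MaaS machine corresponds to a Drydock node when its BMC power_address
# equals the node's OOB network address. These dataclasses are stand-ins,
# not the driver's models.
from dataclasses import dataclass, field
from typing import List, Optional

@dataclass
class StubMachine:
    system_id: str
    hostname: str
    power_parameters: dict = field(default_factory=dict)

@dataclass
class StubNode:
    name: str
    oob_ip: str

def identify_node(machines: List[StubMachine], node: StubNode,
                  update_name: bool = True) -> Optional[StubMachine]:
    matches = [m for m in machines
               if str(m.power_parameters.get('power_address')) == node.oob_ip]
    if len(matches) != 1:
        return None                      # not found (or ambiguous): no match
    machine = matches[0]
    if update_name and machine.hostname != node.name:
        machine.hostname = node.name     # the real driver persists this via update()
    return machine

pool = [StubMachine('abc123', 'maas-enlisted-1', {'power_address': '10.10.10.10'})]
print(identify_node(pool, StubNode('node1', '10.10.10.10')))
```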
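Machines.query() special-cases criteria keys prefixed with 'power_params.' so callers can filter on nested power parameters; the k[13:] slice drops exactly the len('power_params.') == 13 character prefix. A stand-alone sketch of the same dotted-key filter over plain dicts rather than resource objects:

```python
# Stand-alone version of the dotted-key filter in Machines.query():
# 'power_params.<field>' criteria match against each machine's nested power
# parameters, anything else against a top-level field. Data is illustrative.
def query(resources, criteria):
    result = list(resources)
    for key, value in criteria.items():
        if key.startswith('power_params.'):
            field = key[len('power_params.'):]          # k[13:] in the driver
            result = [r for r in result
                      if str(r.get('power_parameters', {}).get(field)) == str(value)]
        else:
            result = [r for r in result if str(r.get(key)) == str(value)]
    return result

machines = [
    {'hostname': 'n1', 'power_parameters': {'power_address': '10.10.10.10'}},
    {'hostname': 'n2', 'power_parameters': {'power_address': '10.10.10.11'}},
]
print(query(machines, {'power_params.power_address': '10.10.10.10'}))
```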
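The orchestrator's PrepareNode handling wraps IdentifyNode in a retry loop because MaaS may need several minutes after PowerCycleNode before it enlists a node, and the in-line TODO notes the threshold should become configurable. A reduced sketch of that loop with the hard-coded values (3 attempts, 5-minute back-off) lifted into parameters; the names and defaults are assumptions, not a proposed config schema, and poll_fn stands in for "run an IdentifyNode driver task and return its result".

```python
# Sketch of the IdentifyNode retry loop from the orchestrator, parameterised.
import time
from typing import Callable

def retry_identify(poll_fn: Callable[[], str],
                   max_attempts: int = 3,
                   delay_seconds: float = 5 * 60) -> bool:
    """Return True as soon as poll_fn() reports success, False after max_attempts."""
    for attempt in range(1, max_attempts + 1):
        if poll_fn() == 'success':
            return True
        if attempt < max_attempts:
            time.sleep(delay_seconds)
    return False

# Stubbed example: the node is identified on the second attempt.
outcomes = iter(['failure', 'success'])
print(retry_identify(lambda: next(outcomes), delay_seconds=0))
```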
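Both the MaaS driver's IdentifyNode aggregation and the orchestrator's PrepareNode handler reduce their subtask outcomes to two booleans, worked and failed, before picking a final result. The mapping, isolated as a helper; plain strings stand in for hd_fields.ActionResult values to keep the sketch self-contained.

```python
# Success / PartialSuccess / Failure roll-up shared by the driver and the
# orchestrator, expressed as a stand-alone function.
def rollup_result(worked: bool, failed: bool) -> str:
    if worked and failed:
        return 'partial_success'
    if worked:
        return 'success'
    return 'failure'

assert rollup_result(True, True) == 'partial_success'
assert rollup_result(True, False) == 'success'
assert rollup_result(False, True) == 'failure'
```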