From f35c63df24d8f47927d055729e9ca08c7bc3fe8c Mon Sep 17 00:00:00 2001
From: John Garbutt <john@johngarbutt.com>
Date: Sat, 9 Nov 2013 20:08:37 +0800
Subject: [PATCH] Make error notifications more consistent.

The error notifications sent by wrap_exception are a good source of
information when tracking down errors in the system. It is easy to spot
patterns in the errors by looking at similarities in the exceptions
raised.

This change looks at making the in-line error notifications
follow the same format, so they become equally useful.

Closes-Bug: #1251602
Change-Id: I1cd64a90ecefc64d12af05487eb6e45aa0362d69
---
 nova/compute/manager.py                | 91 +++++++++++++-------------
 nova/compute/utils.py                  | 44 ++++++++++---
 nova/tests/compute/test_compute.py     | 14 ++--
 nova/tests/compute/test_compute_mgr.py | 15 ++---
 4 files changed, 94 insertions(+), 70 deletions(-)

diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index 83523d786050..e3b07384fd14 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -958,13 +958,13 @@ class ComputeManager(manager.Manager):
 
         extra_usage_info = {}
 
-        def notify(status, msg="", **kwargs):
+        def notify(status, msg="", fault=None, **kwargs):
             """Send a create.{start,error,end} notification."""
             type_ = "create.%(status)s" % dict(status=status)
             info = extra_usage_info.copy()
             info['message'] = unicode(msg)
             self._notify_about_instance_usage(context, instance, type_,
-                    extra_usage_info=info, **kwargs)
+                    extra_usage_info=info, fault=fault, **kwargs)
 
         try:
             self._prebuild_instance(context, instance)
@@ -985,7 +985,7 @@ class ComputeManager(manager.Manager):
 
         except exception.RescheduledException as e:
             # Instance build encountered an error, and has been rescheduled.
-            notify("error", msg=unicode(e))  # notify that build failed
+            notify("error", fault=e)
 
         except exception.BuildAbortException as e:
             # Instance build aborted due to a non-failure
@@ -996,7 +996,7 @@ class ComputeManager(manager.Manager):
             # Instance build encountered a non-recoverable error:
             with excutils.save_and_reraise_exception():
                 self._set_instance_error_state(context, instance['uuid'])
-                notify("error", msg=unicode(e))  # notify that build failed
+                notify("error", fault=e)  # notify that build failed
 
     def _prebuild_instance(self, context, instance):
         self._check_instance_exists(context, instance)
@@ -1128,6 +1128,8 @@ class ComputeManager(manager.Manager):
 
         compute_utils.add_instance_fault_from_exc(context, self.conductor_api,
                 instance, exc_info[1], exc_info=exc_info)
+        self._notify_about_instance_usage(context, instance,
+                'instance.create.error', fault=exc_info[1])
 
         try:
             LOG.debug(_("Clean up resource before rescheduling."),
@@ -1515,12 +1517,12 @@ class ComputeManager(manager.Manager):
 
     def _notify_about_instance_usage(self, context, instance, event_suffix,
                                      network_info=None, system_metadata=None,
-                                     extra_usage_info=None):
+                                     extra_usage_info=None, fault=None):
         compute_utils.notify_about_instance_usage(
             self.notifier, context, instance, event_suffix,
             network_info=network_info,
             system_metadata=system_metadata,
-            extra_usage_info=extra_usage_info)
+            extra_usage_info=extra_usage_info, fault=fault)
 
     def _deallocate_network(self, context, instance,
                             requested_networks=None):
@@ -1697,29 +1699,22 @@ class ComputeManager(manager.Manager):
                             'create.end',
                             extra_usage_info={'message': _('Success')},
                             network_info=network_info)
-        except exception.InstanceNotFound as e:
+        except (exception.InstanceNotFound,
+                exception.UnexpectedDeletingTaskStateError) as e:
             with excutils.save_and_reraise_exception():
                 self._notify_about_instance_usage(context, instance,
-                        'create.end',
-                        extra_usage_info={'message': e.format_message()})
-        except exception.UnexpectedDeletingTaskStateError as e:
-            with excutils.save_and_reraise_exception():
-                msg = e.format_message()
-                self._notify_about_instance_usage(context, instance,
-                        'create.end', extra_usage_info={'message': msg})
+                    'create.end', fault=e)
         except exception.ComputeResourcesUnavailable as e:
             LOG.debug(e.format_message(), instance=instance)
             self._notify_about_instance_usage(context, instance,
-                    'create.error',
-                    extra_usage_info={'message': e.format_message()})
+                    'create.error', fault=e)
             raise exception.RescheduledException(
                     instance_uuid=instance.uuid, reason=e.format_message())
         except exception.BuildAbortException as e:
             with excutils.save_and_reraise_exception():
-                LOG.debug(e.format_message, instance=instance)
+                LOG.debug(e.format_message(), instance=instance)
                 self._notify_about_instance_usage(context, instance,
-                        'create.error', extra_usage_info={'message':
-                                                        e.format_message()})
+                    'create.error', fault=e)
         except (exception.VirtualInterfaceCreateException,
                 exception.VirtualInterfaceMacAddressException,
                 exception.FixedIpLimitExceeded,
@@ -1727,15 +1722,13 @@ class ComputeManager(manager.Manager):
             LOG.exception(_('Failed to allocate network(s)'),
                           instance=instance)
             self._notify_about_instance_usage(context, instance,
-                    'create.error', extra_usage_info={'message':
-                        e.format_message()})
+                    'create.error', fault=e)
             msg = _('Failed to allocate the network(s), not rescheduling.')
             raise exception.BuildAbortException(instance_uuid=instance.uuid,
                     reason=msg)
         except Exception as e:
             self._notify_about_instance_usage(context, instance,
-                    'create.error',
-                    extra_usage_info={'message': str(e)})
+                    'create.error', fault=e)
             raise exception.RescheduledException(
                     instance_uuid=instance.uuid, reason=str(e))
 
@@ -2400,25 +2393,23 @@ class ComputeManager(manager.Manager):
                                bad_volumes_callback=bad_volumes_callback)
 
         except Exception as error:
-            # Can't use save_and_reraise as we don't know yet if we
-            # will re-raise or not
-            type_, value, tb = sys.exc_info()
-
-            compute_utils.add_instance_fault_from_exc(context,
-                            self.conductor_api, instance, error,
-                            sys.exc_info())
-
-            # if the reboot failed but the VM is running don't
-            # put it into an error state
-            new_power_state = self._get_power_state(context, instance)
-            if new_power_state == power_state.RUNNING:
-                LOG.warning(_('Reboot failed but instance is running'),
-                            context=context, instance=instance)
-            else:
-                LOG.error(_('Cannot reboot instance: %s'), error,
-                          context=context, instance=instance)
-                self._set_instance_obj_error_state(context, instance)
-                raise type_, value, tb
+            with excutils.save_and_reraise_exception() as ctxt:
+                exc_info = sys.exc_info()
+                # if the reboot failed but the VM is running don't
+                # put it into an error state
+                new_power_state = self._get_power_state(context, instance)
+                if new_power_state == power_state.RUNNING:
+                    LOG.warning(_('Reboot failed but instance is running'),
+                                context=context, instance=instance)
+                    compute_utils.add_instance_fault_from_exc(context,
+                            self.conductor_api, instance, error, exc_info)
+                    self._notify_about_instance_usage(context, instance,
+                            'reboot.error', fault=error)
+                    ctxt.reraise = False
+                else:
+                    LOG.error(_('Cannot reboot instance: %s'), error,
+                              context=context, instance=instance)
+                    self._set_instance_obj_error_state(context, instance)
 
         if not new_power_state:
             new_power_state = self._get_power_state(context, instance)
@@ -3127,9 +3118,6 @@ class ComputeManager(manager.Manager):
         rescheduled = False
         instance_uuid = instance['uuid']
 
-        compute_utils.add_instance_fault_from_exc(context, self.conductor_api,
-                instance, exc_info[0], exc_info=exc_info)
-
         try:
             # NOTE(comstud): remove the scheduler RPCAPI method when
             # this is adjusted to send to conductor... and then
@@ -3143,14 +3131,23 @@ class ComputeManager(manager.Manager):
             rescheduled = self._reschedule(context, request_spec,
                     filter_properties, instance_uuid, scheduler_method,
                     method_args, task_state, exc_info)
-        except Exception:
+        except Exception as error:
             rescheduled = False
             LOG.exception(_("Error trying to reschedule"),
                           instance_uuid=instance_uuid)
+            compute_utils.add_instance_fault_from_exc(context,
+                    self.conductor_api, instance, error,
+                    exc_info=sys.exc_info())
+            self._notify_about_instance_usage(context, instance,
+                    'resize.error', fault=error)
 
         if rescheduled:
-            # log the original build error
             self._log_original_error(exc_info, instance_uuid)
+            compute_utils.add_instance_fault_from_exc(context,
+                    self.conductor_api, instance, exc_info[1],
+                    exc_info=exc_info)
+            self._notify_about_instance_usage(context, instance,
+                    'resize.error', fault=exc_info[1])
         else:
             # not re-scheduling
             raise exc_info[0], exc_info[1], exc_info[2]
diff --git a/nova/compute/utils.py b/nova/compute/utils.py
index 43022e3b2c3b..796206dee58f 100644
--- a/nova/compute/utils.py
+++ b/nova/compute/utils.py
@@ -39,12 +39,11 @@ CONF.import_opt('host', 'nova.netconf')
 LOG = log.getLogger(__name__)
 
 
-def add_instance_fault_from_exc(context, conductor,
-                                instance, fault, exc_info=None):
-    """Adds the specified fault to the database."""
+def exception_to_dict(fault):
+    """Converts exceptions to a dict for use in notifications."""
+    #TODO(johngarbutt) move to nova/exception.py to share with wrap_exception
 
     code = 500
-
     if hasattr(fault, "kwargs"):
         code = fault.kwargs.get('code', 500)
 
@@ -65,17 +64,35 @@ def add_instance_fault_from_exc(context, conductor,
     # MySQL silently truncates overly long messages, but PostgreSQL throws an
     # error if we don't truncate it.
     u_message = unicode(message)[:255]
-    details = ''
 
-    if exc_info and code == 500:
+    fault_dict = dict(exception=fault)
+    fault_dict["message"] = u_message
+    fault_dict["code"] = code
+    return fault_dict
+
+
+def _get_fault_details(exc_info, error_code):
+    details = ''
+    if exc_info and error_code == 500:
         tb = exc_info[2]
-        details += ''.join(traceback.format_tb(tb))
+        if tb:
+            details = ''.join(traceback.format_tb(tb))
+    return unicode(details)
+
+
+def add_instance_fault_from_exc(context, conductor,
+                                instance, fault, exc_info=None):
+    """Adds the specified fault to the database."""
+
+    fault_dict = exception_to_dict(fault)
+    code = fault_dict["code"]
+    details = _get_fault_details(exc_info, code)
 
     values = {
         'instance_uuid': instance['uuid'],
         'code': code,
-        'message': u_message,
-        'details': unicode(details),
+        'message': fault_dict["message"],
+        'details': details,
         'host': CONF.host
     }
     conductor.instance_fault_create(context, values)
@@ -294,7 +311,7 @@ def notify_usage_exists(notifier, context, instance_ref, current_period=False,
 
 def notify_about_instance_usage(notifier, context, instance, event_suffix,
                                 network_info=None, system_metadata=None,
-                                extra_usage_info=None):
+                                extra_usage_info=None, fault=None):
     """Send a notification about an instance.
 
     :param notifier: a messaging.Notifier
@@ -311,6 +328,13 @@ def notify_about_instance_usage(notifier, context, instance, event_suffix,
     usage_info = notifications.info_from_instance(context, instance,
             network_info, system_metadata, **extra_usage_info)
 
+    if fault:
+        # NOTE(johngarbutt) mirrors the format in wrap_exception
+        fault_payload = exception_to_dict(fault)
+        LOG.debug(fault_payload["message"], instance=instance,
+                  exc_info=True)
+        usage_info.update(fault_payload)
+
     if event_suffix.endswith("error"):
         method = notifier.error
     else:
diff --git a/nova/tests/compute/test_compute.py b/nova/tests/compute/test_compute.py
index 2ed80e056565..61bb924bdff2 100644
--- a/nova/tests/compute/test_compute.py
+++ b/nova/tests/compute/test_compute.py
@@ -2337,6 +2337,7 @@ class ComputeTestCase(BaseTestCase):
             'args': (econtext, instance, expected_nw_info,
                      reboot_type),
             'kwargs': {'block_device_info': fake_block_dev_info}}
+        fault = exception.InstanceNotFound(instance_id='instance-0000')
 
         def fake_reboot(*args, **kwargs):
             reboot_call_info['args'] = args
@@ -2347,7 +2348,7 @@ class ComputeTestCase(BaseTestCase):
             # can't stub it out, thus we skip that comparison.
             kwargs.pop('bad_volumes_callback')
             if fail_reboot:
-                raise exception.InstanceNotFound(instance_id='instance-0000')
+                raise fault
 
         self.stubs.Set(self.compute.driver, 'reboot', fake_reboot)
 
@@ -2362,6 +2363,8 @@ class ComputeTestCase(BaseTestCase):
                     instance).AndReturn(fake_power_state3)
 
         if test_delete:
+            fault = exception.InstanceNotFound(
+                        instance_id=instance['uuid'])
             db.instance_update_and_get_original(
                 econtext, updated_dbinstance1['uuid'],
                 {'power_state': new_power_state,
@@ -2369,8 +2372,7 @@ class ComputeTestCase(BaseTestCase):
                  'vm_state': vm_states.ACTIVE},
                 update_cells=False,
                 columns_to_join=['system_metadata'],
-                ).AndRaise(exception.InstanceNotFound(
-                    instance_id=instance['uuid']))
+                ).AndRaise(fault)
             self.compute._notify_about_instance_usage(
                 econtext,
                 instance,
@@ -2381,8 +2383,7 @@ class ComputeTestCase(BaseTestCase):
                 {'vm_state': vm_states.ERROR},
                 update_cells=False,
                 columns_to_join=['system_metadata'],
-                ).AndRaise(exception.InstanceNotFound(
-                    instance_id=instance['uuid']))
+                ).AndRaise(fault)
         else:
             db.instance_update_and_get_original(
                 econtext, updated_dbinstance1['uuid'],
@@ -2392,6 +2393,9 @@ class ComputeTestCase(BaseTestCase):
                 update_cells=False,
                 columns_to_join=['system_metadata'],
                 ).AndReturn((None, updated_dbinstance2))
+            if fail_running:
+                self.compute._notify_about_instance_usage(econtext, instance,
+                        'reboot.error', fault=fault)
             self.compute._notify_about_instance_usage(
                 econtext,
                 instance,
diff --git a/nova/tests/compute/test_compute_mgr.py b/nova/tests/compute/test_compute_mgr.py
index 07489eda8c85..7e0ffdf30938 100644
--- a/nova/tests/compute/test_compute_mgr.py
+++ b/nova/tests/compute/test_compute_mgr.py
@@ -1138,7 +1138,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
                 network_info=self.network_info,
                 block_device_info=self.block_device_info).AndRaise(exc)
         self._notify_about_instance_usage('create.end',
-                extra_usage_info={'message': exc.format_message()}, stub=False)
+                fault=exc, stub=False)
         conductor_rpcapi.ConductorAPI.instance_update(
             self.context, self.instance['uuid'], mox.IgnoreArg(), 'conductor')
         self.mox.ReplayAll()
@@ -1161,16 +1161,15 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
         self._notify_about_instance_usage('create.start',
             extra_usage_info={'image_name': self.image.get('name')})
         self._build_and_run_instance_update()
+        exc = test.TestingException()
         self.compute.driver.spawn(self.context, self.instance, self.image,
                 self.injected_files, self.admin_pass,
                 network_info=self.network_info,
-                block_device_info=self.block_device_info).AndRaise(
-                        test.TestingException())
+                block_device_info=self.block_device_info).AndRaise(exc)
         conductor_rpcapi.ConductorAPI.instance_update(
             self.context, self.instance['uuid'], mox.IgnoreArg(), 'conductor')
         self._notify_about_instance_usage('create.error',
-            extra_usage_info={'message': str(test.TestingException())},
-            stub=False)
+            fault=exc, stub=False)
         self.mox.ReplayAll()
 
         self.assertRaises(exception.RescheduledException,
@@ -1214,7 +1213,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
                 mock.call(self.context, self.instance, 'create.start',
                     extra_usage_info={'image_name': self.image.get('name')}),
                 mock.call(self.context, self.instance, 'create.error',
-                    extra_usage_info={'message': exc.format_message()})])
+                    fault=exc)])
 
             save.assert_has_calls([
                 mock.call(),
@@ -1249,7 +1248,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
         self._notify_about_instance_usage('create.start',
             extra_usage_info={'image_name': self.image.get('name')})
         self._notify_about_instance_usage('create.error',
-            extra_usage_info={'message': exc.format_message()}, stub=False)
+            fault=exc, stub=False)
         self.compute.compute_task_api.build_instances(self.context,
                 [self.instance], self.image, [], self.admin_pass,
                 self.injected_files, self.requested_networks,
@@ -1283,7 +1282,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
                 self.requested_networks, self.security_groups, self.image,
                 self.block_device_mapping).AndRaise(exc)
         self._notify_about_instance_usage('create.error',
-            extra_usage_info={'message': exc.format_message()}, stub=False)
+            fault=exc, stub=False)
         self.mox.ReplayAll()
         self.assertRaises(exception.BuildAbortException,
                 self.compute._build_and_run_instance, self.context,