Add implementation of split-brain prevention
This patch adds the implementation of split-brain prevention. The part that captures corosync packets is not implemented yet. Change-Id: I6bedd28928ac53dfa30b889d2442d748edad2f9c Implements: bp pythonize-host-and-process-monitor
This commit is contained in:
@@ -49,6 +49,33 @@ If ipmi RA is not set in pacemaker, this value should be set True.
|
|||||||
cfg.IntOpt('ipmi_retry_interval',
|
cfg.IntOpt('ipmi_retry_interval',
|
||||||
default=10,
|
default=10,
|
||||||
help='Retry interval(in seconds) of the ipmitool command.'),
|
help='Retry interval(in seconds) of the ipmitool command.'),
|
||||||
|
cfg.IntOpt('stonith_wait',
|
||||||
|
default=30,
|
||||||
|
help='Standby time(in seconds) until activate STONITH.'),
|
||||||
|
cfg.IntOpt('tcpdump_timeout',
|
||||||
|
default=5,
|
||||||
|
help='Timeout value(in seconds) of the tcpdump command when'
|
||||||
|
' monitors the corosync communication.'),
|
||||||
|
cfg.StrOpt('corosync_multicast_interfaces',
|
||||||
|
help='''
|
||||||
|
The name of interface that corosync is using for mutual communication
|
||||||
|
between hosts.
|
||||||
|
If there are multiple interfaces, specify them in comma-separated
|
||||||
|
like 'enp0s3,enp0s8'.
|
||||||
|
The number of interfaces you specify must be equal to the number of
|
||||||
|
corosync_multicast_ports values and must be in correct order with relevant
|
||||||
|
ports in corosync_multicast_ports.
|
||||||
|
'''),
|
||||||
|
cfg.StrOpt('corosync_multicast_ports',
|
||||||
|
help='''
|
||||||
|
The port numbers that corosync is using for mutual communication
|
||||||
|
between hosts.
|
||||||
|
If there are multiple port numbers, specify them in comma-separated
|
||||||
|
like '5405,5406'.
|
||||||
|
The number of port numbers you specify must be equal to the number of
|
||||||
|
corosync_multicast_interfaces values and must be in correct order with
|
||||||
|
relevant interfaces in corosync_multicast_interfaces.
|
||||||
|
'''),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@@ -46,6 +46,92 @@ class HandleHost(driver.DriverBase):
|
|||||||
self.status_holder = hold_host_status.HostHoldStatus()
|
self.status_holder = hold_host_status.HostHoldStatus()
|
||||||
self.notifier = masakari.SendNotification()
|
self.notifier = masakari.SendNotification()
|
||||||
|
|
||||||
|
def _check_pacemaker_services(self, target_service):
    """Report whether a systemd service appears to be running.

    Runs ``systemctl status <target_service>`` as root through
    ``utils.execute``.

    :param target_service: Name of the systemd unit to query, for
        example 'corosync' or 'pacemaker'.
    :returns: True if the command completed and wrote nothing to stderr,
        False if the command raised or produced stderr output.
    """
    # Build the argument vector directly instead of splitting a joined
    # string, so the unit name is always passed as exactly one argument.
    command = ['systemctl', 'status', target_service]

    try:
        _out, err = utils.execute(*command, run_as_root=True)
    except Exception:
        # NOTE(review): utils.execute presumably raises when systemctl
        # exits non-zero (unit inactive or unknown) -- confirm the exact
        # exception type before narrowing this handler. Any failure to
        # run the command is treated as "service not running".
        LOG.debug("'systemctl status %s' did not succeed.", target_service)
        return False

    # Output on stderr is also treated as "service not running".
    return not err
|
||||||
|
|
||||||
|
def _check_hb_line(self):
    """Check whether the corosync communication is normal.

    The check first asks systemd about corosync, pacemaker and
    pacemaker_remote. When the full stack is running, it then tries to
    capture one packet on each configured interface/port pair with
    tcpdump; the first successful capture means communication is normal.

    :returns: 0 if normal (or if only pacemaker-remote is running),
              1 if no packet could be captured on any interface,
              2 if the configuration is wrong or neither pacemaker nor
              pacemaker-remote is running.
    """
    # Check whether the pacemaker services are normal.
    corosync_status = self._check_pacemaker_services('corosync')
    pacemaker_status = self._check_pacemaker_services('pacemaker')
    pacemaker_remote_status = self._check_pacemaker_services(
        'pacemaker_remote')

    if not (corosync_status and pacemaker_status):
        if not pacemaker_remote_status:
            LOG.error(
                _LE("Neither pacemaker nor pacemaker-remote is running."))
            return 2
        # On a pacemaker-remote node there is no corosync traffic to
        # inspect, so the heartbeat line is reported as normal.
        LOG.info(_LI("Works on pacemaker-remote."))
        return 0

    # Check whether the necessary parameters are set.
    if (CONF.host.corosync_multicast_interfaces is None or
            CONF.host.corosync_multicast_ports is None):
        LOG.error(_LE("corosync_multicast_interfaces or "
                      "corosync_multicast_ports is not set."))
        return 2

    # Strip whitespace around each entry so that values written as
    # 'enp0s3, enp0s8' do not inject blank arguments into the command.
    interfaces = [
        name.strip()
        for name in CONF.host.corosync_multicast_interfaces.split(',')]
    ports = [
        port.strip()
        for port in CONF.host.corosync_multicast_ports.split(',')]

    # Both lists must pair up one-to-one and contain no empty entries;
    # anything else is a configuration error, not a network failure.
    if (len(interfaces) != len(ports) or
            not all(interfaces) or not all(ports)):
        LOG.error(_LE("Incorrect parameters corosync_multicast_interfaces "
                      "or corosync_multicast_ports."))
        return 2

    # Check whether the corosync communication is normal.
    for interface, port in zip(interfaces, ports):
        # 'timeout' bounds the wait; 'tcpdump -c 1' exits after capturing
        # a single packet on the given interface and port.
        command = ['timeout', str(CONF.host.tcpdump_timeout),
                   'tcpdump', '-n', '-c', '1', '-p',
                   '-i', interface, 'port', port]

        try:
            # Execute tcpdump command.
            utils.execute(*command, run_as_root=True)
        except Exception:
            LOG.warning(
                _LW("Corosync communication using '%s' is failed."),
                interface)
            continue

        # If command doesn't raise exception, nic is normal. One healthy
        # interface is enough, so the remaining ones are not probed.
        LOG.info(_LI("Corosync communication using '%s' is normal."),
                 interface)
        return 0

    LOG.error(_LE("Corosync communication is failed."))
    return 1
|
||||||
|
|
||||||
def _check_host_status_by_crmadmin(self):
|
def _check_host_status_by_crmadmin(self):
|
||||||
try:
|
try:
|
||||||
# Execute crmadmin command.
|
# Execute crmadmin command.
|
||||||
@@ -256,12 +342,30 @@ class HandleHost(driver.DriverBase):
|
|||||||
self.running = True
|
self.running = True
|
||||||
while self.running:
|
while self.running:
|
||||||
|
|
||||||
# Check the host status is stable or unstable by crmadmin.
|
# Check whether corosync communication between hosts
|
||||||
if self._check_host_status_by_crmadmin() != 0:
|
# is normal.
|
||||||
|
ret = self._check_hb_line()
|
||||||
|
if ret == 1:
|
||||||
|
# Because my host may be fenced by stonith due to split
|
||||||
|
# brain condition, sleep for a certain time.
|
||||||
|
eventlet.greenthread.sleep(CONF.host.stonith_wait)
|
||||||
|
elif ret == 2:
|
||||||
LOG.warning(_LW("hostmonitor skips monitoring hosts."))
|
LOG.warning(_LW("hostmonitor skips monitoring hosts."))
|
||||||
eventlet.greenthread.sleep(CONF.host.monitoring_interval)
|
eventlet.greenthread.sleep(CONF.host.monitoring_interval)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# Check the host status is stable or unstable by crmadmin.
|
||||||
|
# It only checks when this process runs on the full cluster
|
||||||
|
# stack of corosync.
|
||||||
|
pacemaker_remote_status = self._check_pacemaker_services(
|
||||||
|
'pacemaker_remote')
|
||||||
|
if pacemaker_remote_status is False:
|
||||||
|
if self._check_host_status_by_crmadmin() != 0:
|
||||||
|
LOG.warning(_LW("hostmonitor skips monitoring hosts."))
|
||||||
|
eventlet.greenthread.sleep(
|
||||||
|
CONF.host.monitoring_interval)
|
||||||
|
continue
|
||||||
|
|
||||||
# Check the host status is online or offline by cibadmin.
|
# Check the host status is online or offline by cibadmin.
|
||||||
if self._check_host_status_by_cibadmin() != 0:
|
if self._check_host_status_by_cibadmin() != 0:
|
||||||
LOG.warning(_LW("hostmonitor skips monitoring hosts."))
|
LOG.warning(_LW("hostmonitor skips monitoring hosts."))
|
||||||
|
@@ -35,7 +35,11 @@ class TestHandleHost(testtools.TestCase):
|
|||||||
@mock.patch.object(parse_cib_xml.ParseCibXml, 'have_quorum')
|
@mock.patch.object(parse_cib_xml.ParseCibXml, 'have_quorum')
|
||||||
@mock.patch.object(parse_cib_xml.ParseCibXml, 'set_cib_xml')
|
@mock.patch.object(parse_cib_xml.ParseCibXml, 'set_cib_xml')
|
||||||
@mock.patch.object(utils, 'execute')
|
@mock.patch.object(utils, 'execute')
|
||||||
|
@mock.patch.object(handle_host.HandleHost, '_check_pacemaker_services')
|
||||||
|
@mock.patch.object(handle_host.HandleHost, '_check_hb_line')
|
||||||
def test_monitor_hosts(self,
|
def test_monitor_hosts(self,
|
||||||
|
mock_check_hb_line,
|
||||||
|
mock_check_pacemaker_services,
|
||||||
mock_execute,
|
mock_execute,
|
||||||
mock_set_cib_xml,
|
mock_set_cib_xml,
|
||||||
mock_have_quorum,
|
mock_have_quorum,
|
||||||
@@ -43,6 +47,8 @@ class TestHandleHost(testtools.TestCase):
|
|||||||
|
|
||||||
obj = handle_host.HandleHost()
|
obj = handle_host.HandleHost()
|
||||||
|
|
||||||
|
mock_check_hb_line.return_value = 0
|
||||||
|
mock_check_pacemaker_services.return_value = False
|
||||||
mock_execute.return_value = (EXECUTE_RETURN, '')
|
mock_execute.return_value = (EXECUTE_RETURN, '')
|
||||||
mock_set_cib_xml.return_value = None
|
mock_set_cib_xml.return_value = None
|
||||||
mock_have_quorum.return_value = 0
|
mock_have_quorum.return_value = 0
|
||||||
@@ -50,3 +56,14 @@ class TestHandleHost(testtools.TestCase):
|
|||||||
|
|
||||||
ret = obj.monitor_hosts()
|
ret = obj.monitor_hosts()
|
||||||
self.assertEqual(None, ret)
|
self.assertEqual(None, ret)
|
||||||
|
|
||||||
|
@mock.patch.object(utils, 'execute')
def test_check_hb_line(self, mock_execute):
    """_check_hb_line returns 2 when every command succeeds silently.

    utils.execute is mocked to return empty stdout and stderr.
    NOTE(review): the expected result of 2 presumably comes from the
    corosync_multicast_* options being unset in the test configuration
    -- confirm against the option defaults.
    """
    handler = handle_host.HandleHost()

    # Empty stdout/stderr for every executed command.
    mock_execute.return_value = ('', '')

    self.assertEqual(2, handler._check_hb_line())
|
||||||
|
Reference in New Issue
Block a user