Merge "Fix ceph services stopped when controller-1 is down in DX-Direct"

This commit is contained in:
Zuul
2025-09-25 18:35:35 +00:00
committed by Gerrit Code Review
2 changed files with 64 additions and 25 deletions

View File

@@ -99,6 +99,28 @@ else
IFS=" " read -r -a args <<< "$@"
fi
# Log Management
# Adds PID and PPID information to every message before handing it to wlog.
#
# Usage: log [<name>] <log_level> <message...>
#   <name>       optional tag; recognised because it is not one of the known
#                log levels, and rendered as " (<name>)" in the prefix
#   <log_level>  one of INFO, DEBUG, WARN, ERROR
#   <message>    all remaining arguments, joined with single spaces
# Always returns 0.
log () {
    local name=""
    local log_level="$1"

    # Checking if the first parameter is not a log level.
    # The comparison is an exact match, so a name that contains spaces,
    # regex characters or a fragment of a level (e.g. "IN") is still
    # treated as a name. An empty first parameter is never a name.
    case "${log_level}" in
        INFO|DEBUG|WARN|ERROR|"")
            ;;
        *)
            name=" ($1)"
            log_level="$2"
            shift
            ;;
    esac
    shift

    # "$*" joins the remaining words into the single string wlog expects.
    local message="$*"

    # prefix = <pid_subshell> <ppid_name>[<ppid>] <name|optional>
    local prefix="${BASHPID} $(cat "/proc/${PPID}/comm")[${PPID}]${name}"

    # yyyy-MM-dd HH:mm:ss.SSSSSS /etc/init.d/ceph-init-wrapper <prefix> <log_level>: <message>
    wlog "${prefix}" "${log_level}" "${message}"
    return 0
}
is_ppid_sm()
{
local ppid_name
@@ -156,28 +178,6 @@ if is_ppid_sm && [ "${SM_CEPH_OSD_CURRENT_STATE}" != "${STATE_RUNNING}" ]; then
fi
fi
# Log Management
# Adds PID and PPID information to every message before handing it to wlog.
#
# Usage: log [<name>] <log_level> <message...>
#   <name>       optional tag; recognised because it is not one of the known
#                log levels, and rendered as " (<name>)" in the prefix
#   <log_level>  one of INFO, DEBUG, WARN, ERROR
#   <message>    all remaining arguments, joined with single spaces
# Always returns 0.
log () {
    local name=""
    local log_level="$1"

    # Checking if the first parameter is not a log level.
    # The comparison is an exact match, so a name that contains spaces,
    # regex characters or a fragment of a level (e.g. "IN") is still
    # treated as a name. An empty first parameter is never a name.
    case "${log_level}" in
        INFO|DEBUG|WARN|ERROR|"")
            ;;
        *)
            name=" ($1)"
            log_level="$2"
            shift
            ;;
    esac
    shift

    # "$*" joins the remaining words into the single string wlog expects.
    local message="$*"

    # prefix = <pid_subshell> <ppid_name>[<ppid>] <name|optional>
    local prefix="${BASHPID} $(cat "/proc/${PPID}/comm")[${PPID}]${name}"

    # yyyy-MM-dd HH:mm:ss.SSSSSS /etc/init.d/ceph-init-wrapper <prefix> <log_level>: <message>
    wlog "${prefix}" "${log_level}" "${message}"
    return 0
}
# Identify the ceph network interface from the /etc/platform/platform.conf file.
# The network interface will be stored in the 'ceph_network_interface' variable.
# Return 0 if the variable is found, and 1 if not.

View File

@@ -118,6 +118,29 @@ has_ceph_network_carrier()
return 0
}
# Verify whether the oam, cluster host and mgmt networks have carrier.
# This is a special condition for the AIO-DX Direct setup.
# If none of the networks has carrier, then the other host is down.
# When the other host is down, ceph must start on this host.
#
# Reads the globals oam_interface, cluster_host_interface and
# management_interface. Nothing is written to stdout.
#
# Return 0 if no carrier is detected on all network interfaces.
# Return 1 if carrier has been detected on at least one network interface.
has_all_network_no_carrier()
{
    local iface

    for iface in "${oam_interface}" "${cluster_host_interface}" "${management_interface}"; do
        # grep -q keeps the matched 'ip link' line out of the script output.
        # An interface without the NO-CARRIER flag (or one that cannot be
        # queried) counts as "carrier detected", so stop at the first one.
        if ! ip link show "${iface}" | grep -q NO-CARRIER; then
            return 1
        fi
    done

    # All networks have no carrier, meaning the other host is down
    log INFO "No carrier detected from all network interfaces"
    return 0
}
status()
{
has_ceph_network_carrier
@@ -127,9 +150,25 @@ status()
# Service is "running" and has carrier.
RETVAL=0
else
# Force stop services only if carrier is not detected.
[ ${HAS_CARRIER} -ne 0 ] && stop
RETVAL=1
if [ ${HAS_CARRIER} -ne 0 ]; then
if [ "${system_mode}" == "duplex-direct" ]; then
has_all_network_no_carrier
if [ $? -eq 0 ]; then
log INFO "All network interfaces are not functional, considering the other host is down. Keep Ceph running."
RETVAL=0
else
log INFO "Ceph network interface is not functional in duplex-direct, stopping Ceph."
stop
RETVAL=1
fi
else
log INFO "Ceph network interface is not functional, stopping Ceph."
stop
RETVAL=1
fi
else
RETVAL=1
fi
fi
# NOTE: The Status return is only used in the Start method to validate that there