From 77b414114a132c41760fcf661a5e9cd59def479c Mon Sep 17 00:00:00 2001 From: Eric MacDonald Date: Wed, 24 Sep 2025 12:25:21 -0400 Subject: [PATCH] Mtce power off action is declared successful before verification MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The bmcUtil_is_power_on function is declaring power off while the reading is not reporting "on" rather than waiting to see the "off". Some servers report a transient value of "poweringoff" during the powering off process. The reading not showing "off" makes the function prematurely declare power as off. This update adds an explicit check for the "off" state before declaring the server is powered off'. Similarly for 'on'. This update addresses 2 additional issues found during debug and testing that needed to be fixed to properly test the fix of the original issue. The issues: - in the presence of persistent power control failures, the Power-On and Power-Off FSMs are not timing out. - power off action (when power-off is done) phase produces a power-on action customer log. This update drives consistency into and fixes power action try and retry handling in the following ways: - each power (on/off) action starts with a query of the current state - each power (on/off) action has retries command actions up to 5 times - each action has 10 status query retries with - 5 second power status retry delay for power on - 0 second power action retry delay for power on - 10 second power status retry delay for power off - 30 second power action retry delay for power off - power off fails with max retries after 12 minutes - power on fails with max retries after 3 minutes - each power (on/off) command start produces appropriate customer command start log ; in adminActionChange function. - each power (on/off) acton completion produces appropriate customer action complete log ; in corresponding action FSM. Failure path testing changes for the redfish protocol: - added compile-in FIT capability to test send, recv and reading value failure paths. - adds a debug option that adds the ability to configure mtcAgent, through manual update to /etc/mtc.conf, to save redfishtool request responses in dated files for debug purposes. This is similar to what was added for the hardware monitor in https://opendev.org/starlingx/metal/commit/ a1256a3c322c8f2372de265639a2e3fa504a8b1a Test Plan: IPV4 and IPV6 verified PASS: Verify AIO DX system intall with BMC support. PASS: Verify Power-off handling on servers that - do report intermediate power state change values - do not report intermediate power state change values Power On: values can vary from server to server PASS: Verify power on handling while already powered on ; 5 secs PASS: Verify power on success handling; 10 secs (no retries) PASS: Verify power on handling with partial retries PASS: Verify power on handling with full retries - 3 minutes PASS: Verify power on handling while handling power on PASS: Verify power on handling while handling power off PASS: Verify power on admin action command produces power on command customer log PASS: Verify power on status change produces power on action customer log Power Off: values can vary from server to server PASS: Verify power off handling while already powered off - 5 secs PASS: Verify power off success handling; 45 secs (no retries) PASS: Verify power off handling with partial retries PASS: Verify power off handling with full retries ; 15 minutes PASS: Verify power off handling while handling power off PASS: Verify power off handling while handling power on PASS: Verify power off admin action command produces power off command customer log PASS: Verify power off status change produces power off action customer log Regression: PASS: Verify system node reset with BMC provisioned PASS: Verify system node pxeboot install with BMC provisioned PASS: Verify power control success, failure and retry logging PASS: Verify Horizon host status for all success & failure cases above Closes-Bug: 2125596 Closes-Bug: 2125927 Change-Id: I16b35754ff700f9f6a92b090721b8763ad6933d5 Signed-off-by: Eric MacDonald --- mtce-common/src/common/bmcUtil.cpp | 33 +++- mtce-common/src/common/bmcUtil.h | 1 + mtce-common/src/common/fitCodes.h | 5 + mtce-common/src/common/nodeBase.cpp | 4 +- mtce-common/src/common/nodeBase.h | 10 +- mtce-common/src/common/nodeTimers.h | 10 +- mtce/src/common/nodeClass.cpp | 29 +-- mtce/src/common/nodeClass.h | 8 + mtce/src/maintenance/mtcNodeHdlrs.cpp | 254 +++++++++++++++++++++----- mtce/src/maintenance/mtcThreads.cpp | 8 + 10 files changed, 275 insertions(+), 87 deletions(-) diff --git a/mtce-common/src/common/bmcUtil.cpp b/mtce-common/src/common/bmcUtil.cpp index efbb3178..4693d117 100644 --- a/mtce-common/src/common/bmcUtil.cpp +++ b/mtce-common/src/common/bmcUtil.cpp @@ -329,6 +329,8 @@ int bmcUtil_is_power_on ( string hostname, string & response, bool & power_on) { + string reading = "unknown" ; + bool power_state_unknown = false ; if ( response.empty() ) { wlog ("%s bmc power status query response empty", @@ -343,10 +345,26 @@ int bmcUtil_is_power_on ( string hostname, wlog ("%s failed to tokenize bmc info", hostname.c_str()); return (FAIL_JSON_PARSE) ; } - else if (tolowercase(jsonUtil_get_key_value_string(json_obj,REDFISH_LABEL__POWER_STATE)) == "on" ) + + reading = tolowercase(jsonUtil_get_key_value_string(json_obj,REDFISH_LABEL__POWER_STATE)); + +#ifdef WANT_FIT_TESTING + string test_reading = "" ; + if ( daemon_want_fit ( FIT_CODE__BMC_POWER_STATE, hostname, "power", test_reading )) + reading = test_reading ; +#endif + if ( reading == "on" ) + { + blog1("%s power is on", hostname.c_str()); power_on = true ; - else + } + else if ( reading == "off" ) + { + blog1("%s power is off", hostname.c_str()); power_on = false ; + } + else + power_state_unknown = true ; /* free the json object */ json_object_put(json_obj ); @@ -355,8 +373,17 @@ int bmcUtil_is_power_on ( string hostname, { if ( response.find (IPMITOOL_POWER_ON_STATUS) != std::string::npos ) power_on = true ; - else + else if ( response.find (IPMITOOL_POWER_OFF_STATUS) != std::string::npos ) power_on = false ; + else + power_state_unknown = true ; + } + if ( power_state_unknown ) + { + ilog ("%s power state is '%s' ; last known state was '%s'", + hostname.c_str(), + reading.c_str(), + power_on ? "On":"Off"); } return (PASS); } diff --git a/mtce-common/src/common/bmcUtil.h b/mtce-common/src/common/bmcUtil.h index 3c7dd94f..658ad558 100644 --- a/mtce-common/src/common/bmcUtil.h +++ b/mtce-common/src/common/bmcUtil.h @@ -30,6 +30,7 @@ using namespace std; #define BMC_PROTOCOL__REDFISHTOOL_STR ((const char *)("redfishtool")) #define WANT_DATED_IPMI_SENSOR_DATA_FILES ((const char *)("/var/run/bmc/ipmitool/want_dated_sensor_data_files")) #define WANT_DATED_REDFISH_SENSOR_DATA_FILES ((const char *)("/var/run/bmc/redfishtool/want_dated_sensor_data_files")) +#define WANT_DATED_REDFISH_MTCE_DATA_FILES ((const char *)("/var/run/bmc/redfishtool/want_dated_mtce_data_files")) /* learned graceful and immediate power control command strings */ typedef struct diff --git a/mtce-common/src/common/fitCodes.h b/mtce-common/src/common/fitCodes.h index dd1e8df3..a7367d56 100644 --- a/mtce-common/src/common/fitCodes.h +++ b/mtce-common/src/common/fitCodes.h @@ -130,6 +130,11 @@ #define FIT_CODE__BMC_COMMAND_SEND (60) #define FIT_CODE__BMC_COMMAND_RECV (61) +#define FIT_CODE__BMC_POWER_REQUEST_SEND (62) +#define FIT_CODE__BMC_POWER_REQUEST_RECV (63) +#define FIT_CODE__BMC_POWER_STATUS_SEND (64) +#define FIT_CODE__BMC_POWER_STATUS_RECV (65) +#define FIT_CODE__BMC_POWER_STATE (66) #define FIT_CODE__START_HOST_SERVICES (70) #define FIT_CODE__STOP_HOST_SERVICES (71) diff --git a/mtce-common/src/common/nodeBase.cpp b/mtce-common/src/common/nodeBase.cpp index 0f316e6a..95d89b5a 100755 --- a/mtce-common/src/common/nodeBase.cpp +++ b/mtce-common/src/common/nodeBase.cpp @@ -418,8 +418,8 @@ void mtc_stages_init ( void ) powerStages_str [MTC_POWEROFF__REQ_SEND ] = "Power-Off-Req-Send"; powerStages_str [MTC_POWEROFF__RESP_WAIT ] = "Power-Off-Resp-Wait"; powerStages_str [MTC_POWEROFF__OFFLINE_WAIT ] = "Power-Off-Offline-Wait"; - powerStages_str [MTC_POWEROFF__POWERQRY ] = "Power-Off-Power-Query"; - powerStages_str [MTC_POWEROFF__POWERQRY_WAIT ] = "Power-Off-Power-Query-Wait"; + powerStages_str [MTC_POWEROFF__POWER_STATUS ] = "Power-Off-Status"; + powerStages_str [MTC_POWEROFF__POWER_STATUS_WAIT ] = "Power-Off-Status-Wait"; powerStages_str [MTC_POWEROFF__QUEUE ] = "Power-Off-Queue"; powerStages_str [MTC_POWEROFF__DONE ] = "Power-Off-Done"; powerStages_str [MTC_POWEROFF__FAIL ] = "Power-Off-Fail"; diff --git a/mtce-common/src/common/nodeBase.h b/mtce-common/src/common/nodeBase.h index f7040080..5e11a275 100755 --- a/mtce-common/src/common/nodeBase.h +++ b/mtce-common/src/common/nodeBase.h @@ -357,7 +357,10 @@ typedef enum /* Define Reset and Power Action retry controls ; delay, count and switch threshold */ #define MTC_POWER_ACTION_QUERY_WAIT (30) #define MTC_POWER_ACTION_RETRY_DELAY (20) -#define MTC_POWER_ACTION_RETRY_COUNT (10) +#define MTC_POWER_STATUS_RETRY_COUNT (10) +#define MTC_POWEROFF_STATUS_RETRY_DELAY (10) +#define MTC_POWERON_STATUS_RETRY_DELAY (5) +#define MTC_POWER_ACTION_RETRY_COUNT (5) #define MTC_POWER_ACTION_SWITCH_THRESHOLD (MTC_POWER_ACTION_RETRY_COUNT/2) #define MTC_RESET_ACTION_RETRY_DELAY (20) #define MTC_RESET_ACTION_RETRY_COUNT (10) @@ -1153,8 +1156,8 @@ typedef enum MTC_POWEROFF__FAIL_WAIT, MTC_POWEROFF__QUEUE, MTC_POWEROFF__OFFLINE_WAIT, - MTC_POWEROFF__POWERQRY, - MTC_POWEROFF__POWERQRY_WAIT, + MTC_POWEROFF__POWER_STATUS, + MTC_POWEROFF__POWER_STATUS_WAIT, MTC_POWER__DONE, /* clear power action */ MTC_POWER__STAGES } mtc_powerStages_enum ; @@ -1397,6 +1400,7 @@ void mem_log ( string label, int value, string data ); /* KPI strings */ #define KPI_STR__START "start" #define KPI_STR__COMPLETE "complete" +#define KPI_STR__FAIL "fail" #define KPI_STR__PROCESS_STARTUP "startup" #define KPI_STR__REINSTALL "reinstall" #define KPI_STR__POWER_STATE_LEARNED "power state learned" diff --git a/mtce-common/src/common/nodeTimers.h b/mtce-common/src/common/nodeTimers.h index d6881549..9cf7a323 100755 --- a/mtce-common/src/common/nodeTimers.h +++ b/mtce-common/src/common/nodeTimers.h @@ -86,7 +86,6 @@ #define MTC_REINSTALL_TIMEOUT_MIN (MTC_MINS_1) #define MTC_REINSTALL_TIMEOUT_MAX (MTC_HRS_4) #define MTC_REINSTALL_WAIT_TIMER (10) -#define MTC_BMC_REQUEST_DELAY (10) /* consider making this shorter */ #define LAZY_REBOOT_RETRY_DELAY_SECS (60) #define SM_NOTIFY_UNHEALTHY_DELAY_SECS (5) #define MTC_MIN_ONLINE_PERIOD_SECS (7) @@ -95,8 +94,13 @@ #define MTC_AGENT_TIMEOUT_EXTENSION (5) #define MTC_LOCK_CEPH_DELAY (90) -#define MTC_RECV_RETRY_WAIT (MTC_RETRY_WAIT) -#define MTC_RECV_WAIT (MTC_RETRY_WAIT) +/* Deprecated: MTC_BMC_REQUEST_DELAY + * This 10 second delay is too long for time critical operations. */ +#define MTC_BMC_REQUEST_DELAY (10) +/* New developments in time critical operations should + * begin to use these shorter BMC request delays */ +#define MTC_RECV_WAIT (5) /* secs to wait before rx after a request */ +#define MTC_RECV_RETRY_WAIT (5) /* secs to wait performing a rx retry */ /** Host must stay enabled for this long for the * failed_recovery_counter to get cleared */ diff --git a/mtce/src/common/nodeClass.cpp b/mtce/src/common/nodeClass.cpp index 9444ce1c..2c1bc19f 100755 --- a/mtce/src/common/nodeClass.cpp +++ b/mtce/src/common/nodeClass.cpp @@ -1521,33 +1521,6 @@ int nodeLinkClass::avail_status_change ( string hostname, this->ctl_mtcAlive_gate ( node_ptr, false ) ; } - /* check for need to generate power on log */ - if (( node_ptr->availStatus == MTC_AVAIL_STATUS__POWERED_OFF ) && - ( avail != MTC_AVAIL_STATUS__POWERED_OFF )) - { - if ( node_ptr->adminAction == MTC_ADMIN_ACTION__POWERON ) - { - mtcAlarm_log ( hostname, MTC_LOG_ID__COMMAND_MANUAL_POWER_ON ); - } - else - { - mtcAlarm_log ( hostname, MTC_LOG_ID__COMMAND_AUTO_POWER_ON ); - } - } - - /* check for need to generate power off log */ - if (( node_ptr->availStatus != MTC_AVAIL_STATUS__POWERED_OFF ) && - ( avail == MTC_AVAIL_STATUS__POWERED_OFF )) - { - if ( node_ptr->adminAction == MTC_ADMIN_ACTION__POWEROFF ) - { - mtcAlarm_log ( hostname, MTC_LOG_ID__COMMAND_MANUAL_POWER_OFF ); - } - else - { - mtcAlarm_log ( hostname, MTC_LOG_ID__COMMAND_AUTO_POWER_OFF ); - } - } /* check for need to generate online log */ if (( node_ptr->availStatus != MTC_AVAIL_STATUS__ONLINE ) && @@ -7285,6 +7258,7 @@ int nodeLinkClass::adminActionChange ( struct nodeLinkClass::node * node_ptr, dlog ("%s admin power-on action started started at %s", node_ptr->hostname.c_str(), node_ptr->start_poweron_time.time_buff); + mtcAlarm_log ( node_ptr->hostname, MTC_LOG_ID__COMMAND_MANUAL_POWER_ON ); break ; } case MTC_ADMIN_ACTION__RECOVER: @@ -7308,6 +7282,7 @@ int nodeLinkClass::adminActionChange ( struct nodeLinkClass::node * node_ptr, dlog ("%s admin power-off action started at %s", node_ptr->hostname.c_str(), node_ptr->start_poweroff_time.time_buff); + mtcAlarm_log ( node_ptr->hostname, MTC_LOG_ID__COMMAND_MANUAL_POWER_OFF ); break ; } case MTC_ADMIN_ACTION__DELETE: diff --git a/mtce/src/common/nodeClass.h b/mtce/src/common/nodeClass.h index 1a874ae2..7d2f42a9 100755 --- a/mtce/src/common/nodeClass.h +++ b/mtce/src/common/nodeClass.h @@ -181,6 +181,14 @@ private: /* Retry counter for power actions (on/off)*/ int power_action_retries ; + /* Retry counter for power state change query retries */ + int power_status_retries ; + + /* Set true at the start of a power action. + * Used to force the, power on or off, action + * following the first status query. */ + bool power_action_first ; + /** Generic toggle switch */ bool toggle ; diff --git a/mtce/src/maintenance/mtcNodeHdlrs.cpp b/mtce/src/maintenance/mtcNodeHdlrs.cpp index 2f365c11..897e90c6 100755 --- a/mtce/src/maintenance/mtcNodeHdlrs.cpp +++ b/mtce/src/maintenance/mtcNodeHdlrs.cpp @@ -5022,6 +5022,12 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) mtcTimer_reset ( node_ptr->mtcTimer ) ; mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_TASK_UPDATE_DELAY ); powerStageChange ( node_ptr , MTC_POWEROFF__FAIL_WAIT ); + + kpi_log ( node_ptr->hostname, KPI_AREA__ACTION, + KPI_ACTION__POWEROFF, + KPI_STR__START, + KPI_STR__FAIL, + node_ptr->start_poweroff_time); break ; } case MTC_POWEROFF__FAIL_WAIT: @@ -5055,17 +5061,25 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) rc ); } - node_ptr->power_action_retries = MTC_POWER_ACTION_RETRY_COUNT ; + if ( node_ptr->availStatus == MTC_AVAIL_STATUS__POWERED_OFF ) + { + wlog ("%s power should already be off", node_ptr->hostname.c_str()); + } + /* These are count 'down' counters */ + node_ptr->power_action_retries = MTC_POWER_ACTION_RETRY_COUNT ; /* power action retries before failure */ + node_ptr->power_status_retries = MTC_POWER_STATUS_RETRY_COUNT ; /* power status queries before next power action */ /* don't allow a timeout of zero to be passed in */ - if ( power_off_retry_wait == 0 ) - power_off_retry_wait = DEFAULT_POWER_OFF_RETRY_WAIT ; + if ( this->power_off_retry_wait == 0 ) + this->power_off_retry_wait = DEFAULT_POWER_OFF_RETRY_WAIT ; ilog ("%s power off retry wait is %d seconds", - node_ptr->hostname.c_str(), power_off_retry_wait); + node_ptr->hostname.c_str(), this->power_off_retry_wait); + /* Query the state first and make that immediate */ + node_ptr->power_action_first = true ; mtcTimer_reset ( node_ptr->mtcTimer ) ; - powerStageChange ( node_ptr , MTC_POWEROFF__REQ_SEND ); + powerStageChange ( node_ptr , MTC_POWEROFF__POWER_STATUS ); break ; } case MTC_POWEROFF__REQ_SEND: @@ -5091,7 +5105,14 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) else { - rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_OFF ); +#ifdef WANT_FIT_TESTING + if ( daemon_want_fit ( FIT_CODE__BMC_POWER_REQUEST_SEND, node_ptr->hostname )) + rc = daemon_get_file_int ("/var/run/return_code") ; + else +#endif + { + rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_OFF ); + } if ( rc ) { wlog ("%s Power-Off request failed (%d)", node_ptr->hostname.c_str(), rc ); @@ -5121,7 +5142,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) else if ( rc ) { elog ("%s Power-Off command failed\n", node_ptr->hostname.c_str()); - powerStageChange ( node_ptr , MTC_POWEROFF__POWERQRY ); + powerStageChange ( node_ptr , MTC_POWEROFF__POWER_STATUS ); mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_QUERY_WAIT ); } else @@ -5144,7 +5165,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) mtcTimer_reset ( node_ptr->mtcTimer ); plog ("%s is now offline\n", node_ptr->hostname.c_str()); - powerStageChange ( node_ptr , MTC_POWEROFF__POWERQRY ); + powerStageChange ( node_ptr , MTC_POWEROFF__POWER_STATUS ); mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_QUERY_WAIT ); } else if ( mtcTimer_expired ( node_ptr->mtcTimer ) ) @@ -5154,7 +5175,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) } break ; } - case MTC_POWEROFF__POWERQRY: + case MTC_POWEROFF__POWER_STATUS: { /* give the power off action some time to complete */ if ( mtcTimer_expired ( node_ptr->mtcTimer ) ) @@ -5173,7 +5194,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) } else { - powerStageChange ( node_ptr , MTC_POWEROFF__POWERQRY_WAIT ); + powerStageChange ( node_ptr , MTC_POWEROFF__POWER_STATUS_WAIT ); } mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RECV_WAIT ); } @@ -5184,7 +5205,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) } break ; } - case MTC_POWEROFF__POWERQRY_WAIT: + case MTC_POWEROFF__POWER_STATUS_WAIT: { if ( mtcTimer_expired ( node_ptr->mtcTimer ) ) { @@ -5222,9 +5243,55 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) { if ( node_ptr->power_on == true ) { - ilog ("%s Power not Off ; retry power-off ", - node_ptr->hostname.c_str()); - powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE ); + /* If this is the first query just gets the current state. + * Issue the power off request. */ + if ( node_ptr->power_action_first ) + { + powerStageChange ( node_ptr , MTC_POWEROFF__REQ_SEND ); + + /* issue the power off command immediately - timer is already expired */ + node_ptr->power_action_first = false ; + } + /* If we get here then this is not the first power status query. + * Handle it as a regular power status query while retries remain. */ + else if ( --node_ptr->power_status_retries > 0 ) + { + powerStageChange ( node_ptr , MTC_POWEROFF__POWER_STATUS ); + + ilog ("%s power not off ; power status retry %d of %d in %d secs (power action retry %d of %d)", + node_ptr->hostname.c_str(), + MTC_POWER_STATUS_RETRY_COUNT - node_ptr->power_status_retries, + MTC_POWER_STATUS_RETRY_COUNT, + MTC_POWEROFF_STATUS_RETRY_DELAY, + MTC_POWER_ACTION_RETRY_COUNT - node_ptr->power_action_retries, + MTC_POWER_ACTION_RETRY_COUNT); + + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWEROFF_STATUS_RETRY_DELAY ); + } + /* If we get here then power status retries have mxed out. + * Issue the power off action command again. */ + else if ( --node_ptr->power_action_retries > 0 ) + { + ilog ("%s power off action retry %d of %d in %d secs", + node_ptr->hostname.c_str(), + MTC_POWER_ACTION_RETRY_COUNT - node_ptr->power_action_retries, + MTC_POWER_ACTION_RETRY_COUNT, + this->power_off_retry_wait); + powerStageChange ( node_ptr , MTC_POWEROFF__REQ_SEND ); + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, this->power_off_retry_wait ); + + /* reload the power status query countdown counter */ + node_ptr->power_status_retries = MTC_POWER_STATUS_RETRY_COUNT ; + } + else /* Max retries reached */ + { + powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE ); + wlog ("%s power not 'off' ; max retries", + node_ptr->hostname.c_str()); + + /* correct the counter when transitioning to QUEUE */ + node_ptr->power_action_retries++ ; + } } else { @@ -5236,6 +5303,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) KPI_STR__START, KPI_STR__COMPLETE, node_ptr->start_poweroff_time); + mtcAlarm_log ( node_ptr->hostname, MTC_LOG_ID__COMMAND_AUTO_POWER_OFF ); powerStageChange ( node_ptr , MTC_POWEROFF__DONE ); mtcTimer_reset ( node_ptr->mtcTimer ); break ; @@ -5271,20 +5339,25 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) node_ptr->bmc_thread_info.status ); } powerStageChange ( node_ptr , MTC_POWEROFF__REQ_SEND ); - ilog ("%s waiting %d seconds before next power off retry", - node_ptr->hostname.c_str(), power_off_retry_wait); - mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, power_off_retry_wait ); + /* No delay for the action retry ; delay is in the power query */ + mtcTimer_reset ( node_ptr->mtcTimer ); } else { powerStageChange ( node_ptr , MTC_POWEROFF__FAIL ); } + + /* reload for next action retry */ + node_ptr->power_status_retries = MTC_POWER_STATUS_RETRY_COUNT ; + node_ptr->power_action_first = false ; break ; } case MTC_POWEROFF__DONE: { if ( mtcTimer_expired ( node_ptr->mtcTimer ) ) { + node_ptr->power_action_first = false ; + plog ("%s Power-Off Completed\n", node_ptr->hostname.c_str()); stop_offline_handler ( node_ptr ); @@ -5307,6 +5380,12 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) mtcTimer_reset ( node_ptr->mtcTimer ) ; mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_TASK_UPDATE_DELAY ); powerStageChange ( node_ptr , MTC_POWERON__FAIL_WAIT ); + + kpi_log ( node_ptr->hostname, KPI_AREA__ACTION, + KPI_ACTION__POWERON, + KPI_STR__START, + KPI_STR__FAIL, + node_ptr->start_poweron_time); break ; } case MTC_POWERON__FAIL_WAIT: @@ -5345,7 +5424,13 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) rc ); } - node_ptr->power_action_retries = MTC_POWER_ACTION_RETRY_COUNT ; + /* These are count 'down' counters */ + node_ptr->power_action_retries = MTC_POWER_ACTION_RETRY_COUNT ; /* power action retries before failure */ + node_ptr->power_status_retries = MTC_POWER_STATUS_RETRY_COUNT ; /* power status queries before next power action */ + + /* Query the state first and make that immediate */ + node_ptr->power_action_first = true ; + powerStageChange ( node_ptr , MTC_POWERON__POWER_STATUS ); //the fall through to MTC_POWERON__REQ_SEND is intentional MTCE_FALLTHROUGH; @@ -5364,7 +5449,14 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) break ; } - rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_STATUS ) ; +#ifdef WANT_FIT_TESTING + if ( daemon_want_fit ( FIT_CODE__BMC_POWER_STATUS_SEND, node_ptr->hostname )) + rc = daemon_get_file_int ("/var/run/return_code") ; + else +#endif + { + rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_STATUS ) ; + } if ( rc ) { powerStageChange ( node_ptr , MTC_POWERON__QUEUE ); @@ -5374,17 +5466,24 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) powerStageChange ( node_ptr , MTC_POWERON__POWER_STATUS_WAIT ); } mtcTimer_reset ( node_ptr->mtcTimer ); - mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_BMC_REQUEST_DELAY ); + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RECV_WAIT ); break ; } case MTC_POWERON__POWER_STATUS_WAIT: { if ( mtcTimer_expired ( node_ptr->mtcTimer ) ) { - rc = bmc_command_recv ( node_ptr ); +#ifdef WANT_FIT_TESTING + if ( daemon_want_fit ( FIT_CODE__BMC_POWER_STATUS_RECV, node_ptr->hostname )) + rc = daemon_get_file_int ("/var/run/return_code") ; + else +#endif + { + rc = bmc_command_recv ( node_ptr ); + } if ( rc == RETRY ) { - mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT ); + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RECV_RETRY_WAIT ); } else if ( rc == PASS ) { @@ -5400,15 +5499,56 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) if ( node_ptr->power_on ) { - ilog ("%s power is already on ; no action required\n", node_ptr->hostname.c_str()); - mtcInvApi_update_task ( node_ptr, "Power Already On" ); - mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_TASK_UPDATE_DELAY ); + ilog ("%s power is on", node_ptr->hostname.c_str()); + mtcInvApi_update_task ( node_ptr, "Power Is On" ); powerStageChange ( node_ptr , MTC_POWERON__DONE ); } + /* If this is the first query then that was just to get the current state. + * If we get here then its not powered 'on' so issue the power on request. + * Remainig power 'on' action retries are handled in the 'Status Wait' case. */ + else if ( node_ptr->power_action_first == true ) + { + powerStageChange ( node_ptr , MTC_POWERON__REQ_SEND ); + ilog ("%s power is not on", node_ptr->hostname.c_str() ); + node_ptr->power_action_first = false ; + } + /* Otherwise retry the status until the status retry countdown counter + * reaches zero. */ + else if ( --node_ptr->power_status_retries > 0 ) + { + powerStageChange ( node_ptr , MTC_POWERON__POWER_STATUS ); + ilog ("%s power not on ; power status retry %d of %d in %d secs (power action retry %d of %d)", + node_ptr->hostname.c_str(), + MTC_POWER_STATUS_RETRY_COUNT - node_ptr->power_status_retries, + MTC_POWER_STATUS_RETRY_COUNT, + MTC_POWERON_STATUS_RETRY_DELAY, + MTC_POWER_ACTION_RETRY_COUNT - node_ptr->power_action_retries, + MTC_POWER_ACTION_RETRY_COUNT); + + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWERON_STATUS_RETRY_DELAY ); + } + /* The status retry counted down and now we need the power action + * to be re-issued. */ + else if ( --node_ptr->power_action_retries > 0 ) + { + powerStageChange ( node_ptr , MTC_POWERON__REQ_SEND ); + ilog ("%s power on action retry %d of %d", + node_ptr->hostname.c_str(), + MTC_POWER_ACTION_RETRY_COUNT - node_ptr->power_action_retries, + MTC_POWER_ACTION_RETRY_COUNT); + + /* reload the power status query countdown counter */ + node_ptr->power_status_retries = MTC_POWER_STATUS_RETRY_COUNT ; + } + /* Max action retries reached. Let the QUEUE stage handle it */ else { - ilog ("%s power is off ; powering on ...\n", node_ptr->hostname.c_str() ); - powerStageChange ( node_ptr , MTC_POWERON__REQ_SEND ); + powerStageChange ( node_ptr , MTC_POWERON__QUEUE ); + wlog ("%s power not 'on' ; max retries", + node_ptr->hostname.c_str()); + + /* correct the counter when transitioning to QUEUE */ + node_ptr->power_action_retries++ ; } } else @@ -5448,7 +5588,14 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) } else { - rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_ON ); +#ifdef WANT_FIT_TESTING + if ( daemon_want_fit ( FIT_CODE__BMC_POWER_REQUEST_SEND, node_ptr->hostname )) + rc = daemon_get_file_int ("/var/run/return_code") ; + else +#endif + { + rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_ON ); + } if ( rc ) { wlog ("%s Power-On request failed (%d)\n", @@ -5462,7 +5609,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) { blog ("%s Power-On requested\n", node_ptr->hostname.c_str()); - mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_BMC_REQUEST_DELAY ); + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RECV_WAIT ); powerStageChange ( node_ptr , MTC_POWERON__RESP_WAIT ); } @@ -5473,12 +5620,12 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) { if ( mtcTimer_expired ( node_ptr->mtcTimer ) ) { - rc = bmc_command_recv ( node_ptr ); - if ( rc == RETRY ) - { - mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT ); - break ; - } + rc = bmc_command_recv ( node_ptr ); + if ( rc == RETRY ) + { + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RECV_RETRY_WAIT ); + break ; + } if ( rc ) { @@ -5489,19 +5636,12 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) else { ilog ("%s is Powering On\n", node_ptr->hostname.c_str() ); - node_ptr->power_on = true ; - node_ptr->power_on_time_secs = time(NULL); - kpi_log ( node_ptr->hostname, KPI_AREA__ACTION, - KPI_ACTION__POWERON, - KPI_STR__START, - KPI_STR__COMPLETE, - node_ptr->start_poweron_time); + powerStageChange ( node_ptr , MTC_POWERON__POWER_STATUS ); + if ( node_ptr->adminAction != MTC_ADMIN_ACTION__REINSTALL ) { mtcInvApi_update_task ( node_ptr, "Powering On" ); } - mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_TASK_UPDATE_DELAY ); - powerStageChange ( node_ptr , MTC_POWERON__DONE ); } } break ; @@ -5526,7 +5666,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) node_ptr->bmc_thread_info.status ); } - powerStageChange ( node_ptr , MTC_POWERON__POWER_STATUS ); + powerStageChange ( node_ptr , MTC_POWERON__REQ_SEND ); } else { @@ -5539,11 +5679,27 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) { if ( mtcTimer_expired ( node_ptr->mtcTimer ) ) { - plog ("%s Power-On Completed\n", node_ptr->hostname.c_str()); + if ( node_ptr->power_on ) + { + plog ("%s Power-On Completed ; with %d retries", + node_ptr->hostname.c_str(), + MTC_POWER_ACTION_RETRY_COUNT-node_ptr->power_action_retries); + node_ptr->power_on_time_secs = time(NULL); + kpi_log ( node_ptr->hostname, KPI_AREA__ACTION, + KPI_ACTION__POWERON, + KPI_STR__START, + KPI_STR__COMPLETE, + node_ptr->start_poweron_time); - availStatusChange ( node_ptr, MTC_AVAIL_STATUS__OFFLINE ); - - powerStageChange ( node_ptr , MTC_POWER__DONE ); + availStatusChange ( node_ptr, MTC_AVAIL_STATUS__OFFLINE ); + mtcAlarm_log ( node_ptr->hostname, MTC_LOG_ID__COMMAND_AUTO_POWER_ON ); + powerStageChange ( node_ptr , MTC_POWER__DONE ); + } + else + { + plog ("%s Power-On Failed", node_ptr->hostname.c_str()); + powerStageChange ( node_ptr, MTC_POWERON__FAIL ); + } /* If we are powering on a node then there is no need for any further reset progression */ if ( node_ptr->mtcCmd_work_fifo.size() ) diff --git a/mtce/src/maintenance/mtcThreads.cpp b/mtce/src/maintenance/mtcThreads.cpp index f86b2d80..89a3dee3 100644 --- a/mtce/src/maintenance/mtcThreads.cpp +++ b/mtce/src/maintenance/mtcThreads.cpp @@ -324,6 +324,14 @@ void * mtcThread_bmc ( void * arg ) datafile, DEFAULT_SYSTEM_REQUEST_LATENCY_SECS); + /* Debug Option - enable lane debug_bmgt3 = 8 and touch + * /var/run/bmc/redfish/want_dated_mtce_data_files + * ... to save the current datafile with a dated extension + * so that a read history is maintained for debug purposes. */ + if(daemon_get_cfg_ptr()->debug_bmgmt&8) + if ( daemon_is_file_present (WANT_DATED_REDFISH_MTCE_DATA_FILES)) + daemon_copy_file(info_ptr->hostname, datafile.data()); + if ( rc != PASS ) { /* Log the command that failed unless ...