Files
update/software/service-files/software-init.sh
Heitor Matsui fd90264778 Create LVM snapshot-based boot recovery
As part of an effort to minimize the time to recover an
AIO-SX system in case of a boot failure during an upgrade,
this commit creates a service that executes during the boot
sequence and:

1. Verifies if system is booted from the rollback deployment
2. Checks if current ostree commit-id matches from-release commit-id
3. If 1 and 2 are met, attempts to restore the LVM snapshots

The delete_older_deployments function is changed as well: it now
finds the active deployment (which, in this case, is not the first
entry in the ostree admin status output) and only allows removing
deployments whose index is greater than that of the active one.

Test Plan
PASS: AIO-SX install/bootstrap/unlock
PASS: AIO-SX upgrade stx-10 -> stx-11 using the LVM snapshots
      feature; force a boot failure by using the grub flag and verify
      that the service executes and restores the snapshots

Story: 2011357
Task: 52265

Change-Id: I38836f03301b4b2c3cb2c2e288e66c53f4c0b07e
Signed-off-by: Heitor Matsui <heitorvieira.matsui@windriver.com>
2025-06-16 14:31:11 -03:00

158 lines
5.1 KiB
Bash

#!/bin/bash
#
# Copyright (c) 2023 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
# Unified Software Management
# chkconfig: 345 20 23
# description: StarlingX Unified Software Management init script
### BEGIN INIT INFO
# Provides: software
# Required-Start: $syslog
# Required-Stop: $syslog
# Default-Start: 2 3 5
# Default-Stop: 0 1 6
# Short-Description: software
# Description: Provides the Unified Software Management component
### END INIT INFO
# Script name, used as the prefix of every line written by LOG_TO_FILE.
# "$0" is quoted so a script path containing whitespace is not word-split.
NAME=$(basename "$0")

# Platform settings. This script reads system_mode, http_port, SW_VERSION,
# INSTALL_UUID and INITIAL_CONTROLLER_CONFIG_COMPLETE without assigning them,
# so they are presumably provided by these two files -- confirm which file
# supplies which variable.
. /usr/bin/tsconfig
. /etc/platform/platform.conf

# Log destination for LOG_TO_FILE and for software-agent's stderr.
logfile=/var/log/software.log
# Flag file whose presence after an install run marks the run as failed.
software_install_failed_file=/var/run/software_install_failed
# The next two paths are defined here but not referenced again in this script.
software_updated_during_init_file=/etc/software/.software_updated_during_init
node_is_software_updated_rr_file=/var/persist/software-agent/node_is_software_updated_rr

# if the system has never been bootstrapped, system_mode is not set
# treat a non bootstrapped system like it is simplex
# and manually manage lighttpd, etc..
if [[ -z "${system_mode}" ]]; then
    system_mode="simplex"
fi
#######################################
# Append one timestamped message line to the log file.
# Globals:   NAME (read)    - prefix identifying this script
#            logfile (read) - file the line is appended to
# Arguments: $* - message words, joined by the first character of IFS
# Outputs:   appends "<timestamp>: <NAME>: <message>" to ${logfile}
#######################################
function LOG_TO_FILE {
    # The redirect target is quoted: an unquoted path containing whitespace
    # makes bash reject the redirection as ambiguous.
    printf '%s: %s: %s\n' "$(date "+%FT%T.%3N")" "${NAME}" "$*" >> "${logfile}"
}
#######################################
# Check whether our installed load matches the active controller.
# Fetches the controller's install_uuid over HTTP and compares it with
# the local INSTALL_UUID.
# Globals:   http_port, SW_VERSION, INSTALL_UUID, HOSTNAME (read)
#            CONTROLLER_UUID (written) - value fetched from the controller
# Arguments: none
# Outputs:   on failure, the reason is echoed to stdout and also logged
#            through LOG_TO_FILE
# Returns:   0 if the UUIDs match, or if the fetch fails on controller-1;
#            1 if the fetch fails on any other host or the UUIDs differ
#######################################
function check_install_uuid {
    local msg

    # The URL is quoted so the expanded port/version can never be word-split
    # or glob-expanded into extra curl arguments.
    if ! CONTROLLER_UUID=$(curl -sf "http://controller:${http_port}/feed/rel-${SW_VERSION}/install_uuid"); then
        if [[ "${HOSTNAME}" == "controller-1" ]]; then
            # If we're on controller-1, controller-0 may not have the install_uuid
            # matching this release, if we're in an upgrade. If the file doesn't exist,
            # bypass this check
            return 0
        fi

        msg="Unable to retrieve installation uuid from active controller"
        LOG_TO_FILE "${msg}"
        echo "${msg}"
        return 1
    fi

    if [[ "${INSTALL_UUID}" != "${CONTROLLER_UUID}" ]]; then
        msg="This node is running a different load than the active controller and must be reinstalled"
        LOG_TO_FILE "${msg}"
        echo "${msg}"
        return 1
    fi

    return 0
}
# Check for installation failure
if [[ -f /etc/platform/installation_failed ]]; then
    LOG_TO_FILE "/etc/platform/installation_failed flag is set. Aborting."
    echo "$(basename "$0"): Detected installation failure. Aborting."
    exit 1
fi

# For AIO-SX, abort if config is not yet applied and this is running in init
# (i.e. the action is "start"). This is a skip, not an error: exit status 0.
#
# [[ ... && ... ]] with a quoted flag path replaces the old
# [ ... -a ... -a ... ] form, which turned into a malformed test expression
# whenever INITIAL_CONTROLLER_CONFIG_COMPLETE expanded to nothing. An empty
# flag path is now simply treated as "flag file not present".
if [[ "${system_mode}" == "simplex" \
        && ! -f "${INITIAL_CONTROLLER_CONFIG_COMPLETE}" \
        && "$1" == "start" ]]; then
    LOG_TO_FILE "Config is not yet applied. Skipping init software"
    exit 0
fi
# If the management interface is bonded, it may take some time
# before communications can be properly setup.
# Allow up to $DELAY_SEC seconds to reach controller.
# NOTE: this wait sits ahead of the action dispatch, so it runs for every
# action passed to the script.
DELAY_SEC=120
START=$(date +%s)
FOUND=0

# Probe once per second until the host answers or the time budget is spent.
while (( $(date +%s) < START + DELAY_SEC )); do
    LOG_TO_FILE "Waiting for controller to be pingable"
    # Try ping first and fall back to ping6; either one succeeding is enough.
    if ping -c 1 controller > /dev/null 2>&1 \
            || ping6 -c 1 controller > /dev/null 2>&1; then
        LOG_TO_FILE "controller is pingable"
        FOUND=1
        break
    fi
    sleep 1
done

if (( FOUND == 0 )); then
    # 'controller' is not available, just exit
    LOG_TO_FILE "Unable to contact active controller (controller). Boot will continue."
    exit 1
fi
# Exit status of the script; set to 1 when a software operation fails.
RC=0

#######################################
# Run the software agent install and record whether it failed.
# Failure is detected by the presence of the failure flag file after the
# agent returns; the agent's own exit status is not inspected.
# Globals:   logfile (read)                      - receives the agent's stderr
#            software_install_failed_file (read) - failure flag file
#            RC (written)                        - set to 1 on failure
# Arguments: none
#######################################
function run_software_install {
    LOG_TO_FILE "***** Starting software operation *****"
    /usr/bin/software-agent --install 2>> "${logfile}"
    if [[ -f "${software_install_failed_file}" ]]; then
        RC=1
        LOG_TO_FILE "***** Software operation failed *****"
    fi
    LOG_TO_FILE "***** Finished software operation *****"
}

# Dispatch on the requested action.
case "$1" in
    start)
        if [[ "${system_mode}" == "simplex" ]]; then
            # On a simplex CPE, we need to launch the http server first,
            # before we can do the software installation
            LOG_TO_FILE "***** Launching lighttpd *****"
            /etc/init.d/lighttpd start
            run_software_install
            LOG_TO_FILE "***** Shutting down lighttpd *****"
            /etc/init.d/lighttpd stop
        else
            if ! check_install_uuid; then
                # The INSTALL_UUID doesn't match the active controller, so exit
                exit 1
            fi
            run_software_install
        fi
        ;;
    stop)
        # Nothing to do here
        ;;
    restart)
        # Unlike "start", restart neither manages lighttpd nor checks the
        # install uuid; it only re-runs the software operation.
        run_software_install
        ;;
    *)
        echo "Usage: $0 {start|stop|restart}"
        exit 1
        ;;
esac

exit "${RC}"