Create LVM snapshot-based boot recovery
As part of an effort to minimize the time to recover an AIO-SX system
in case of a boot failure during an upgrade, this commit creates a
service that executes during the boot sequence and:

1. Verifies if the system is booted from the rollback deployment
2. Checks if the current ostree commit-id matches the from-release
   commit-id
3. If 1 and 2 are met, attempts to restore the LVM snapshots

The delete_older_deployments function is changed as well, to find the
active deployment (which may not be the first entry in the
"ostree admin status" output) and only allow removing deployments with
indexes greater than the active one.

Test Plan
PASS: AIO-SX install/bootstrap/unlock
PASS: AIO-SX upgrade stx-10 -> stx-11 using the LVM snapshots feature,
      force a boot failure by using the grub flag, verify the service
      executes and restores the snapshots

Story: 2011357
Task: 52265

Change-Id: I38836f03301b4b2c3cb2c2e288e66c53f4c0b07e
Signed-off-by: Heitor Matsui <heitorvieira.matsui@windriver.com>
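For reference, a condensed sketch of the boot-time checks the new service performs. This is not the committed init script (that appears in full in the diff below); it only summarizes its flow, and the feed path and manage-lvm-snapshots helper location are taken from that script rather than assumed:

    #!/bin/bash
    # Sketch of the recovery flow; the real lvm-snapshot-restore script below also logs each step.

    # 1. Act only when the kernel booted the rollback ostree deployment
    #    (identified by the ostree=/ostree/2 argument on the kernel cmdline).
    grep -q "ostree=/ostree/2" /proc/cmdline || exit 0

    # 2. Act only when the deployed ostree commit-id matches the from-release
    #    feed commit-id, i.e. the node really came up on the previous release.
    source /etc/build.info   # provides SW_VERSION of the rollback release
    deployed=$(ostree admin status | grep "^\*" | awk '{ sub(/\.[0-9]+/, "", $3); print $3 }')
    rollback=$(ostree --repo=/var/www/pages/feed/rel-${SW_VERSION}/ostree_repo rev-parse starlingx)
    [ "${deployed}" = "${rollback}" ] || exit 0

    # 3. If LVM snapshots exist, restore them and reboot into the restored state.
    /usr/sbin/software-deploy/manage-lvm-snapshots --list || exit 0
    /usr/sbin/software-deploy/manage-lvm-snapshots --restore && reboot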
@@ -43,6 +43,8 @@ override_dh_install:
         ${ROOT}/etc/init.d/software-controller
     install -m 500 service-files/usm-initialize-init.sh \
         ${ROOT}/etc/init.d/usm-initialize
+    install -m 500 service-files/lvm-snapshot-restore.sh \
+        ${ROOT}/etc/init.d/lvm-snapshot-restore
     install -m 600 service-files/software.conf \
         ${ROOT}/etc/software/software.conf
     install -m 644 service-files/policy.json \
software/service-files/lvm-snapshot-restore.service (new file, 14 lines)
@@ -0,0 +1,14 @@
+[Unit]
+Description=Restore LVM Snapshots
+DefaultDependencies=no
+After=systemd-udev-settle.service local-fs.target var-log.mount
+Before=software.service controllerconfig.service
+
+[Service]
+Type=oneshot
+ExecStart=/etc/init.d/lvm-snapshot-restore
+TimeoutStartSec=300
+RemainAfterExit=yes
+
+[Install]
+WantedBy=multi-user.target
software/service-files/lvm-snapshot-restore.sh (new file, 64 lines)
@@ -0,0 +1,64 @@
+#!/bin/bash
+#
+# Copyright (c) 2025 Wind River Systems, Inc.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+### BEGIN INIT INFO
+# Description: lvm-snapshot-restore
+#
+# Short-Description: Restore LVM Snapshots
+# Provides: lvm-snapshot-restore
+# Required-Start:
+# Required-Stop:
+# Default-Start: 3 5
+# Default-Stop: 3 5
+### END INIT INFO
+
+NAME=$(basename $0)
+LOG_FILE="/var/log/lvm-snapshot-restore.log"
+RESTORE_SCRIPT="/usr/sbin/software-deploy/manage-lvm-snapshots"
+
+# Function to log messages to both stdout and log file
+log() {
+    echo "$(date '+%FT%T.%3N'): $NAME: $*" >> $LOG_FILE
+}
+
+# Detect if the system booted into the previous deployment
+if ! grep -q "ostree=/ostree/2" /proc/cmdline; then
+    log "System is not booted from the rollback deployment."
+    exit 0
+fi
+log "System is booted from rollback deployment."
+
+# Verify if deployed commit-id matches rollback ostree commit-id
+source /etc/build.info
+log "Rollback major release version is ${SW_VERSION}"
+DEPLOYED_COMMIT_ID=$(ostree admin status | grep "^\*" | awk '{ sub(/\.[0-9]+/, "", $3); print $3 }')
+ROLLBACK_COMMIT_ID=$(ostree --repo=/var/www/pages/feed/rel-${SW_VERSION}/ostree_repo rev-parse starlingx)
+if [ ! $DEPLOYED_COMMIT_ID = $ROLLBACK_COMMIT_ID ]; then
+    log "Deployed ostree commit-id doesn't match ${SW_VERSION} ostree commit-id"
+    exit 0
+fi
+
+log "Checking LVM snapshots..."
+${RESTORE_SCRIPT} --list
+if [ $? -ne 0 ]; then
+    log "No LVM snapshots to restore."
+    exit 0
+fi
+
+log "Starting LVM snapshot restore..."
+${RESTORE_SCRIPT} --restore
+
+if [ $? -eq 0 ]; then
+    log "All LVM snapshots restored successfully. Rebooting..."
+    reboot
+else
+    log "Couldn't restore the LVM snapshots, lvdisplay output:"
+    log "$(lvdisplay)"
+    log "Check software.log for more details."
+    exit 1
+fi
+
+exit 0
@@ -39,29 +39,6 @@ function LOG_TO_FILE {
     echo "`date "+%FT%T.%3N"`: $NAME: $*" >> $logfile
 }
 
-function check_for_rr_software_update {
-    if [ -f ${node_is_software_updated_rr_file} ]; then
-        if [ ! -f ${software_updated_during_init_file} ]; then
-            echo
-            echo "Node has had its software updated and requires an immediate reboot."
-            echo
-            LOG_TO_FILE "Node has had its software updated, with reboot-required flag set. Rebooting"
-            touch ${software_updated_during_init_file}
-            /sbin/reboot
-        else
-            echo
-            echo "Node has had its software updated during init a second consecutive time. Skipping reboot due to possible error"
-            echo
-            LOG_TO_FILE "Node has had its software updated during init a second consecutive time. Skipping reboot due to possible error"
-            touch ${software_install_failed_file}
-            rm -f ${software_updated_during_init_file}
-            exit 1
-        fi
-    else
-        rm -f ${software_updated_during_init_file}
-    fi
-}
-
 function check_install_uuid {
     # Check whether our installed load matches the active controller
     CONTROLLER_UUID=`curl -sf http://controller:${http_port}/feed/rel-${SW_VERSION}/install_uuid`
@@ -158,7 +135,6 @@ case "$1" in
             LOG_TO_FILE "***** Finished software operation *****"
         fi
 
-        check_for_rr_software_update
         ;;
     stop)
         # Nothing to do here
@@ -12,6 +12,7 @@ from datetime import datetime
 from datetime import timezone
 import json
 import logging
+from packaging import version
 from pathlib import Path
 import shutil
 import subprocess
@@ -170,6 +171,11 @@ class VarSnapshot(LVMSnapshot):
         deploy = content.get("deploy")
         for d in deploy:
             d["state"] = "host-rollback-done"
+            from_release = d["from_release"]
+            to_release = d["to_release"]
+            if version.Version(to_release) > version.Version(from_release):
+                d["from_release"] = to_release
+                d["to_release"] = from_release
         with open(software_json, "w") as fp:
             fp.write(json.dumps(content))
         LOG.info("Deployment data updated")
@@ -375,6 +381,7 @@ def main():
         manager.delete_snapshots()
     elif args.list:
         snapshots = [snapshot.to_json() for snapshot in manager.list_snapshots()]
+        success = bool(snapshots)  # True if snapshots exist, False otherwise
         print(json.dumps(snapshots, indent=4))
     else:
         parser.print_usage()
@@ -498,19 +498,17 @@ def delete_older_deployments():
     # Sample command and output that is parsed to get the list of
     # deployment IDs
     #
-    # Command: ostree admin status | grep debian
+    # Command: ostree admin status | egrep 'debian [a-z0-9]+'
     #
     # Output:
     #
-    # * debian 3334dc80691a38c0ba6c519ec4b4b449f8420e98ac4d8bded3436ade56bb229d.2
-    #   debian 3334dc80691a38c0ba6c519ec4b4b449f8420e98ac4d8bded3436ade56bb229d.1 (rollback)
-    #   debian 3334dc80691a38c0ba6c519ec4b4b449f8420e98ac4d8bded3436ade56bb229d.0
-
-    LOG.info("Inside delete_older_deployments of ostree_utils")
-    cmd = "ostree admin status | grep debian"
+    # * debian 9a4d8040800f8cf9191ca3401f8006f3df5760b33d78f931309b5bb5db062ab3.2
+    #   debian 9a4d8040800f8cf9191ca3401f8006f3df5760b33d78f931309b5bb5db062ab3.1 (rollback)
+    #   debian 9a4d8040800f8cf9191ca3401f8006f3df5760b33d78f931309b5bb5db062ab3.0
+
+    cmd = "ostree admin status | egrep 'debian [a-z0-9]+'"
     try:
-        output = subprocess.run(cmd, shell=True, check=True, capture_output=True)
+        output = subprocess.run(cmd, shell=True, check=True, capture_output=True, text=True)
     except subprocess.CalledProcessError as e:
         msg = "Failed to fetch ostree admin status."
         info_msg = "OSTree Admin Status Error: return code: %s , Output: %s" \
@@ -518,68 +516,34 @@ def delete_older_deployments():
         LOG.info(info_msg)
         raise OSTreeCommandFail(msg)
 
-    # Store the output of the above command in a string
-    output_string = output.stdout.decode('utf-8')
+    # Find the active deployment (which usually is the first, but there are exceptions)
+    # and once found attempt to delete deployments after it in the list, except the rollback
+    delete_deployments = False
+    deployments_to_delete = []
+    for index, deployment in enumerate(output.stdout.strip().split("\n")):
+        if delete_deployments and "rollback" not in deployment:
+            deployments_to_delete.append(index)
+        if "*" in deployment:
+            LOG.info("Active deployment %s: %s", index, deployment)
+            delete_deployments = True
 
-    # Parse the string to get the latest commit for the ostree
-    split_output_string = output_string.split()
-    deployment_id_list = []
-    for index, deployment_id in enumerate(split_output_string):
-        if deployment_id == "debian":
-            deployment_id_list.append(split_output_string[index + 1])
-
-    # After a reboot, the deployment ID at the 0th index of the list
-    # is always the active deployment and the deployment ID at the
-    # 1st index of the list is always the fallback deployment.
-    # We want to delete all deployments except the two mentioned above.
-    # This means we will undeploy all deployments starting from the
-    # 2nd index of deployment_id_list
-    deploys_amount = len(deployment_id_list)
-    if deploys_amount <= 2:
+    if not deployments_to_delete:
         LOG.info("No older deployments to delete")
-        return
+        return True
 
-    for index in reversed(range(2, deploys_amount)):
+    for index in reversed(deployments_to_delete):
         try:
             cmd = "ostree admin undeploy %s" % index
-            output = subprocess.run(cmd, shell=True, check=True, capture_output=True)
-            info_log = "Deleted ostree deployment %s" % deployment_id_list[index]
+            output = subprocess.run(cmd, shell=True, check=True, capture_output=True, text=True)
+            info_log = "Deleted ostree deployment %s: %s" % (index, output.stdout)
             LOG.info(info_log)
         except subprocess.CalledProcessError as e:
-            msg = "Failed to undeploy ostree deployment %s." % deployment_id_list[index]
+            msg = "Failed to undeploy ostree deployment %s." % index
             info_msg = "OSTree Undeploy Error: return code: %s , Output: %s" \
-                % (e.returncode, e.stderr.decode("utf-8"))
+                % (e.returncode, e.stderr)
             LOG.info(info_msg)
             raise OSTreeCommandFail(msg)
 
-
-def undeploy_inactive_deployments():
-    """
-    Remove deployments other than the current deployment,
-    i.e. deployments from index 1 to len(deployments) - 1,
-    in the reverse order, from the oldest to the newest
-    """
-    cmd = ["ostree", "admin", "status"]
-    try:
-        output = subprocess.run(cmd, text=True, check=True, capture_output=True)
-    except subprocess.CalledProcessError as e:
-        LOG.exception("Error getting ostree deployment list: %s" % e.stderr)
-        return False
-
-    success = True
-    pattern = r"debian [a-z0-9]+.[0-9]+"
-    deployments = re.findall(pattern, output.stdout)
-    # skip the first (active) deployment
-    for index, deployment in reversed(list(enumerate(deployments[1:], 1))):
-        commit_id = deployment.replace("debian ", "").split(".")[0]
-        cmd = ["ostree", "admin", "undeploy", str(index)]
-        try:
-            subprocess.run(cmd, check=True)
-            LOG.info("Removed deployment %s, commit-id %s" % (index, commit_id))
-        except subprocess.CalledProcessError as e:
-            LOG.exception("Error removing deployment %s, commit-id %s: %s" % (index, commit_id, e.stderr))
-            success = False
-    return success
+    return True
 
 
 def checkout_latest_ostree_commit(patch_sw_version):
@@ -375,7 +375,7 @@ class SoftwareMessageDeployDeleteCleanupReq(messages.PatchMessage):
         success_remove_upgrade_flags = remove_major_release_deployment_flags()
 
         # undeploy the from-release ostree deployment to free sysroot disk space
-        success_ostree_undeploy_from_release = ostree_utils.undeploy_inactive_deployments()
+        success_ostree_undeploy_from_release = ostree_utils.delete_older_deployments()
 
         cleanup_results = [
             (success_ostree_remote_cleanup, "cleaning temporary refs/remotes"),