Create LVM snapshot-based boot recovery

As part of an effort to minimize the time to recover an
AIO-SX system in case of a boot failure during an upgrade,
this commit creates a service that executes during the boot
sequence and:

1. Verifies whether the system is booted from the rollback deployment
2. Checks whether the current ostree commit-id matches the from-release commit-id
3. If both conditions 1 and 2 are met, attempts to restore the LVM snapshots

The delete_older_deployments function is changed as well, to
find the active deployment (which in this case is not the first
in the ostree admin status output) and only allow removing
deployments with indexes greater than that of the active deployment.

Test Plan
PASS: AIO-SX install/bootstrap/unlock
PASS: AIO-SX upgrade stx-10 -> stx-11 using the LVM snapshots
      feature, force a boot failure by using the grub flag, verify
      that the service executes and restores the snapshots

Story: 2011357
Task: 52265

Change-Id: I38836f03301b4b2c3cb2c2e288e66c53f4c0b07e
Signed-off-by: Heitor Matsui <heitorvieira.matsui@windriver.com>
This commit is contained in:
Heitor Matsui
2025-05-27 09:50:33 -03:00
parent 45d22a068e
commit fd90264778
7 changed files with 112 additions and 85 deletions

View File

@@ -43,6 +43,8 @@ override_dh_install:
${ROOT}/etc/init.d/software-controller
install -m 500 service-files/usm-initialize-init.sh \
${ROOT}/etc/init.d/usm-initialize
install -m 500 service-files/lvm-snapshot-restore.sh \
${ROOT}/etc/init.d/lvm-snapshot-restore
install -m 600 service-files/software.conf \
${ROOT}/etc/software/software.conf
install -m 644 service-files/policy.json \

View File

@@ -0,0 +1,14 @@
# Unit that runs the /etc/init.d/lvm-snapshot-restore script once during boot.
[Unit]
Description=Restore LVM Snapshots
# Opt out of systemd's implicit default dependencies; the ordering this unit
# needs is declared explicitly with After=/Before= below.
DefaultDependencies=no
# Start only after udev has settled and local filesystems, including the
# /var/log mount the script writes its log to, are available.
After=systemd-udev-settle.service local-fs.target var-log.mount
# Must complete before the software and controllerconfig services start.
Before=software.service controllerconfig.service
[Service]
# oneshot: units ordered after this one wait until the script has exited.
Type=oneshot
ExecStart=/etc/init.d/lvm-snapshot-restore
# Give the script up to 300 seconds before systemd considers the start failed.
TimeoutStartSec=300
# Keep the unit in the active state after the script exits.
RemainAfterExit=yes
[Install]
WantedBy=multi-user.target

View File

@@ -0,0 +1,64 @@
#!/bin/bash
#
# Copyright (c) 2025 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
### BEGIN INIT INFO
# Description: lvm-snapshot-restore
#
# Short-Description: Restore LVM Snapshots
# Provides: lvm-snapshot-restore
# Required-Start:
# Required-Stop:
# Default-Start: 3 5
# Default-Stop: 3 5
### END INIT INFO

# Boot-time recovery helper. When the kernel command line shows the system
# was booted from the rollback deployment, and the deployed ostree commit-id
# matches the commit-id of the release recorded in /etc/build.info, restore
# the LVM snapshots and reboot. In every "nothing to do" case the script
# exits 0; it exits 1 only when the restore itself fails.

NAME=$(basename "$0")
LOG_FILE="/var/log/lvm-snapshot-restore.log"
RESTORE_SCRIPT="/usr/sbin/software-deploy/manage-lvm-snapshots"

# Append a timestamped message to the log file
log() {
    echo "$(date '+%FT%T.%3N'): $NAME: $*" >> "$LOG_FILE"
}

# Detect if the system booted into the previous deployment
if ! grep -q "ostree=/ostree/2" /proc/cmdline; then
    log "System is not booted from the rollback deployment."
    exit 0
fi
log "System is booted from rollback deployment."

# Verify if deployed commit-id matches rollback ostree commit-id
source /etc/build.info
log "Rollback major release version is ${SW_VERSION}"

# Commit-id of the active ("*") deployment, with the ".N" serial stripped
DEPLOYED_COMMIT_ID=$(ostree admin status | grep "^\*" | awk '{ sub(/\.[0-9]+/, "", $3); print $3 }')
ROLLBACK_COMMIT_ID=$(ostree --repo="/var/www/pages/feed/rel-${SW_VERSION}/ostree_repo" rev-parse starlingx)

# An empty commit-id means one of the lookups above failed. Without this
# guard (and the quoting below) the comparison would error out or evaluate
# false, and the restore would run without the match ever being verified.
if [ -z "$DEPLOYED_COMMIT_ID" ] || [ -z "$ROLLBACK_COMMIT_ID" ]; then
    log "Couldn't determine the deployed or the ${SW_VERSION} ostree commit-id"
    exit 0
fi

if [ "$DEPLOYED_COMMIT_ID" != "$ROLLBACK_COMMIT_ID" ]; then
    log "Deployed ostree commit-id doesn't match ${SW_VERSION} ostree commit-id"
    exit 0
fi

log "Checking LVM snapshots..."
# A non-zero exit status from --list is treated as "nothing to restore"
if ! "${RESTORE_SCRIPT}" --list; then
    log "No LVM snapshots to restore."
    exit 0
fi

log "Starting LVM snapshot restore..."
if "${RESTORE_SCRIPT}" --restore; then
    log "All LVM snapshots restored successfully. Rebooting..."
    reboot
else
    log "Couldn't restore the LVM snapshots, lvdisplay output:"
    log "$(lvdisplay)"
    log "Check software.log for more details."
    exit 1
fi

exit 0

View File

@@ -39,29 +39,6 @@ function LOG_TO_FILE {
echo "`date "+%FT%T.%3N"`: $NAME: $*" >> $logfile
}
# Reboot the node once when a software update flagged it as reboot-required.
# If the same situation is seen on a second consecutive init, record the
# install as failed and exit 1 instead of rebooting again.
function check_for_rr_software_update {
    # No reboot-required marker: just clear the "updated during init" flag.
    if [ ! -f ${node_is_software_updated_rr_file} ]; then
        rm -f ${software_updated_during_init_file}
        return
    fi

    # The flag is already present, so the previous init also updated the
    # software: give up instead of rebooting in a loop.
    if [ -f ${software_updated_during_init_file} ]; then
        echo
        echo "Node has had its software updated during init a second consecutive time. Skipping reboot due to possible error"
        echo
        LOG_TO_FILE "Node has had its software updated during init a second consecutive time. Skipping reboot due to possible error"
        touch ${software_install_failed_file}
        rm -f ${software_updated_during_init_file}
        exit 1
    fi

    # First occurrence: leave the flag behind and reboot.
    echo
    echo "Node has had its software updated and requires an immediate reboot."
    echo
    LOG_TO_FILE "Node has had its software updated, with reboot-required flag set. Rebooting"
    touch ${software_updated_during_init_file}
    /sbin/reboot
}
function check_install_uuid {
# Check whether our installed load matches the active controller
CONTROLLER_UUID=`curl -sf http://controller:${http_port}/feed/rel-${SW_VERSION}/install_uuid`
@@ -158,7 +135,6 @@ case "$1" in
LOG_TO_FILE "***** Finished software operation *****"
fi
check_for_rr_software_update
;;
stop)
# Nothing to do here

View File

@@ -12,6 +12,7 @@ from datetime import datetime
from datetime import timezone
import json
import logging
from packaging import version
from pathlib import Path
import shutil
import subprocess
@@ -170,6 +171,11 @@ class VarSnapshot(LVMSnapshot):
deploy = content.get("deploy")
for d in deploy:
d["state"] = "host-rollback-done"
from_release = d["from_release"]
to_release = d["to_release"]
if version.Version(to_release) > version.Version(from_release):
d["from_release"] = to_release
d["to_release"] = from_release
with open(software_json, "w") as fp:
fp.write(json.dumps(content))
LOG.info("Deployment data updated")
@@ -375,6 +381,7 @@ def main():
manager.delete_snapshots()
elif args.list:
snapshots = [snapshot.to_json() for snapshot in manager.list_snapshots()]
success = bool(snapshots) # True is snapshots exists, False otherwise
print(json.dumps(snapshots, indent=4))
else:
parser.print_usage()

View File

@@ -498,19 +498,17 @@ def delete_older_deployments():
# Sample command and output that is parsed to get the list of
# deployment IDs
#
# Command: ostree admin status | grep debian
# Command: ostree admin status | egrep 'debian [a-z0-9]+'
#
# Output:
#
# * debian 3334dc80691a38c0ba6c519ec4b4b449f8420e98ac4d8bded3436ade56bb229d.2
# debian 3334dc80691a38c0ba6c519ec4b4b449f8420e98ac4d8bded3436ade56bb229d.1 (rollback)
# debian 3334dc80691a38c0ba6c519ec4b4b449f8420e98ac4d8bded3436ade56bb229d.0
LOG.info("Inside delete_older_deployments of ostree_utils")
cmd = "ostree admin status | grep debian"
# * debian 9a4d8040800f8cf9191ca3401f8006f3df5760b33d78f931309b5bb5db062ab3.2
# debian 9a4d8040800f8cf9191ca3401f8006f3df5760b33d78f931309b5bb5db062ab3.1 (rollback)
# debian 9a4d8040800f8cf9191ca3401f8006f3df5760b33d78f931309b5bb5db062ab3.0
cmd = "ostree admin status | egrep 'debian [a-z0-9]+'"
try:
output = subprocess.run(cmd, shell=True, check=True, capture_output=True)
output = subprocess.run(cmd, shell=True, check=True, capture_output=True, text=True)
except subprocess.CalledProcessError as e:
msg = "Failed to fetch ostree admin status."
info_msg = "OSTree Admin Status Error: return code: %s , Output: %s" \
@@ -518,68 +516,34 @@ def delete_older_deployments():
LOG.info(info_msg)
raise OSTreeCommandFail(msg)
# Store the output of the above command in a string
output_string = output.stdout.decode('utf-8')
# Find the active deployment (which usually is the first, but there are exceptions)
# and once found attempt to delete deployments after it in the list, except the rollback
delete_deployments = False
deployments_to_delete = []
for index, deployment in enumerate(output.stdout.strip().split("\n")):
if delete_deployments and "rollback" not in deployment:
deployments_to_delete.append(index)
if "*" in deployment:
LOG.info("Active deployment %s: %s", index, deployment)
delete_deployments = True
# Parse the string to get the latest commit for the ostree
split_output_string = output_string.split()
deployment_id_list = []
for index, deployment_id in enumerate(split_output_string):
if deployment_id == "debian":
deployment_id_list.append(split_output_string[index + 1])
# After a reboot, the deployment ID at the 0th index of the list
# is always the active deployment and the deployment ID at the
# 1st index of the list is always the fallback deployment.
# We want to delete all deployments except the two mentioned above.
# This means we will undeploy all deployments starting from the
# 2nd index of deployment_id_list
deploys_amount = len(deployment_id_list)
if deploys_amount <= 2:
if not deployments_to_delete:
LOG.info("No older deployments to delete")
return
return True
for index in reversed(range(2, deploys_amount)):
for index in reversed(deployments_to_delete):
try:
cmd = "ostree admin undeploy %s" % index
output = subprocess.run(cmd, shell=True, check=True, capture_output=True)
info_log = "Deleted ostree deployment %s" % deployment_id_list[index]
output = subprocess.run(cmd, shell=True, check=True, capture_output=True, text=True)
info_log = "Deleted ostree deployment %s: %s" % (index, output.stdout)
LOG.info(info_log)
except subprocess.CalledProcessError as e:
msg = "Failed to undeploy ostree deployment %s." % deployment_id_list[index]
msg = "Failed to undeploy ostree deployment %s." % index
info_msg = "OSTree Undeploy Error: return code: %s , Output: %s" \
% (e.returncode, e.stderr.decode("utf-8"))
% (e.returncode, e.stderr)
LOG.info(info_msg)
raise OSTreeCommandFail(msg)
def undeploy_inactive_deployments():
    """
    Remove deployments other than the current deployment,
    i.e. deployments from index 1 to len(deployments) - 1,
    in the reverse order, from the oldest to the newest

    :return: True if the deployment list was read and every undeploy
             command succeeded (including when there was nothing to
             remove); False if listing failed or any undeploy failed.
    """
    cmd = ["ostree", "admin", "status"]
    try:
        output = subprocess.run(cmd, text=True, check=True, capture_output=True)
    except subprocess.CalledProcessError as e:
        LOG.exception("Error getting ostree deployment list: %s", e.stderr)
        return False

    # Matches "debian <commit-id>.<serial>"; the dot is escaped so that it
    # only matches the literal separator before the deployment serial.
    pattern = r"debian [a-z0-9]+\.[0-9]+"
    deployments = re.findall(pattern, output.stdout)

    success = True
    # skip the first deployment in the list (treated as the active one) and
    # walk from the highest index down, so the indexes still to be removed
    # are not shifted by earlier removals
    for index, deployment in reversed(list(enumerate(deployments[1:], 1))):
        commit_id = deployment.replace("debian ", "").split(".")[0]
        cmd = ["ostree", "admin", "undeploy", str(index)]
        try:
            # Capture stderr as text so a failure can actually be logged;
            # without it CalledProcessError.stderr is always None.
            subprocess.run(cmd, check=True, stderr=subprocess.PIPE, text=True)
            LOG.info("Removed deployment %s, commit-id %s", index, commit_id)
        except subprocess.CalledProcessError as e:
            LOG.exception("Error removing deployment %s, commit-id %s: %s",
                          index, commit_id, e.stderr)
            # Keep going: remove as many deployments as possible
            success = False
    return success
return True
def checkout_latest_ostree_commit(patch_sw_version):

View File

@@ -375,7 +375,7 @@ class SoftwareMessageDeployDeleteCleanupReq(messages.PatchMessage):
success_remove_upgrade_flags = remove_major_release_deployment_flags()
# undeploy the from-release ostree deployment to free sysroot disk space
success_ostree_undeploy_from_release = ostree_utils.undeploy_inactive_deployments()
success_ostree_undeploy_from_release = ostree_utils.delete_older_deployments()
cleanup_results = [
(success_ostree_remote_cleanup, "cleaning temporary refs/remotes"),