From 044b6c050bb8bfa6755e9ac2825c76f1e5991fa8 Mon Sep 17 00:00:00 2001 From: Davlet Panech Date: Fri, 25 Jul 2025 16:30:58 -0400 Subject: [PATCH] archive-dir.sh: rewrite in Python Old script was slow and didn't preserve hardlinks within the source set. This script doesn't link files that are identical within the source set, ie same checksum & attributes, but different inode. It can only link such files to similar files from older builds. This deficiency will be addressed in a separate commit. TESTS =================== * Manually test various input directories, including: - a directory that contains each type of file (regular, devices, sockets, symlinks, etc) - old index files with spaces in file names * Given a build with a dozen or so historical builds, copied the "aptly" directory and compared timing and destination directory size before/after this patch: - old script: time=4m13s size=56.0G - new script: time=14s size=6.1G * Run a Jenkins build that rebuilds one package, and doesn't clean/rebuild the ISO. Make sure "archive-misc" works as expected. 
Change-Id: Ic8f8931c4143bc355db1ccbad56ed772c0f3081e Signed-off-by: Davlet Panech --- dockerfiles/coreutils/Dockerfile | 4 +- scripts/archive-misc.sh | 9 +- scripts/helpers/archive-dir.py | 826 +++++++++++++++++++++++++++++++ scripts/helpers/archive-dir.sh | 516 ------------------- scripts/lib/job_utils.sh | 2 +- 5 files changed, 834 insertions(+), 523 deletions(-) create mode 100755 scripts/helpers/archive-dir.py delete mode 100755 scripts/helpers/archive-dir.sh diff --git a/dockerfiles/coreutils/Dockerfile b/dockerfiles/coreutils/Dockerfile index 375e1e6..f541321 100644 --- a/dockerfiles/coreutils/Dockerfile +++ b/dockerfiles/coreutils/Dockerfile @@ -1,8 +1,8 @@ -FROM debian:11 +FROM debian:12 RUN apt-get update -y && \ apt-get upgrade -y && \ - apt-get install -y bsdextrautils parallel && \ + apt-get install -y bsdextrautils parallel python3 && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* diff --git a/scripts/archive-misc.sh b/scripts/archive-misc.sh index abfa6f5..731af50 100755 --- a/scripts/archive-misc.sh +++ b/scripts/archive-misc.sh @@ -109,13 +109,14 @@ do_archive_dir() { tmp_dir="$BUILD_HOME/tmp/archive-misc" rm -rf "$tmp_dir/$id" mkdir -p "$tmp_dir/$id" - cp -a "$THIS_DIR/helpers/archive-dir.sh" "$tmp_dir/" + cp -a "$THIS_DIR/helpers/archive-dir.py" "$tmp_dir/" local archive_args=() if [[ "$spec_method" == "checksum-hardlink" ]] ; then local old_checksums_file_list="$tmp_dir/$id/old_checksums_file.list" local find_func=find_old_checksum_files__$id $find_func >"$old_checksums_file_list" - archive_args+=("--checksum-hardlink" "$old_checksums_file_list") + archive_args+=("--checksum-hardlink") + archive_args+=("--old-index-files-from=$old_checksums_file_list") local extra_checksums_file for extra_checksums_file in "$@" ; do print_regfile_name_if_exists "$extra_checksums_file" @@ -132,10 +133,10 @@ do_archive_dir() { local src_dir="$BUILD_HOME/$dir" local dst_dir="$BUILD_OUTPUT_HOME/$dir" maybe_run mkdir -p "$dst_dir" - safe_docker_run $DRY_RUN_ARG 
--writeable-archive-root --rm "$COREUTILS_DOCKER_IMG" "$tmp_dir/archive-dir.sh" \ + safe_docker_run $DRY_RUN_ARG --writeable-archive-root --rm "$COREUTILS_DOCKER_IMG" "$tmp_dir/archive-dir.py" \ "${archive_args[@]}" \ -j ${PARALLEL_CMD_JOBS:-1} \ - --output-checksums "$BUILD_OUTPUT_HOME/$dir/$CHECKSUMS_FILENAME" \ + --output-checksums="$BUILD_OUTPUT_HOME/$dir/$CHECKSUMS_FILENAME" \ "$src_dir" \ "$dst_dir" \ "$tmp_dir/$id" diff --git a/scripts/helpers/archive-dir.py b/scripts/helpers/archive-dir.py new file mode 100755 index 0000000..4efe9db --- /dev/null +++ b/scripts/helpers/archive-dir.py @@ -0,0 +1,826 @@ +#!/usr/bin/env python3 + +import sys +assert sys.version_info >= (3, 9), "Python >= 3.9 is required" + +HELP="""\ +Usage: archive-dir.py [...] + [...] + +Archive SRC_DIR in DST_DIR, using TMP_DIR for temporary files. + +Create the index file, DST_DIR/StxChecksums. + +With --checksum-hardlink, attempt to link identical files from older builds +instead of copying them. + + + -v,--verbose be verbose + + -j,--jobs=N perform various operations in parallel (default: 1) + + --owner=OWNER set destination files' owner, requires root + privileges. + + --group=GROUP set destination files' group as specified; requires root + privileges, or current user must be a member of GROUP + + --checksum-hardlink + Hardlink destination files if possible. You must provide + one or more index files (StxChecksums) generated by older + builds. We will use the files with matching properties & + checksums to create hard links in DST_DIR. + + --old-index-files-from=OLD_INDEX_LIST_FILE + Read additional index file names from OLD_INDEX_LIST_FILE + + --reflink Create light-weight (COW) file copies if possible. This + only applies when copying (ie when no link candidates + found) + + --skip-existing Skip files that already exist at destination. We still need + to calculate their checksums in order to create the index, + but we will skip the copy. 
+ + --keep-temp-files + Normally we delete temporary files upon successful + completion, this option will keep them. + +""" + +# FIXME: this doesn't link files that are identical within the source set, +# ie same checksum & attributes, but different inode. It can only link +# such files to similar files from older builds. + +import argparse +from collections.abc import Iterable +from collections.abc import Callable +from dataclasses import dataclass +import grp +import hashlib +import itertools +from multiprocessing import Pool +import os +from pathlib import Path +import pwd +import re +import shutil +import shlex +import stat +import subprocess +from typing import TextIO + +JOBS = 1 +CHECKSUM_READ_SIZE = 4 * 1024 * 1024 # 4 MiB +COPY_REFLINK = False +OLD_INDEX_FILES = [] +SKIP_EXISTING = False +SRC_DIR = None +DST_DIR = None +TMP_DIR = None +CHANGE_UID = None +CHANGE_GID = None +VERBOSITY = 0 +CURRENT_GID_LIST = [] +OUTPUT_INDEX_FILE = None +KEEP_TEMP_FILES = False + +def log_error(msg:str)->None: + print('ERROR: %s' % msg, file=sys.stderr) + +def log_warn(msg:str)->None: + print('WARNING: %s' % msg, file=sys.stderr) + +def log_info(msg:str)->None: + print('%s' % msg, file=sys.stderr) + +def log_debug(msg:str)->None: + if VERBOSITY > 0: + print('%s' % msg, file=sys.stderr) + +def log_shell_cmd(cmd:str)->None: + if VERBOSITY > 0: + print('%% %s' % cmd, file=sys.stderr) + +# Apply func to items returned by an iterator in parallel. +# Returns an iterator with the results of func, in unpredictable +# order. +def map_p(func:Callable, it:Iterable)->Iterable: + pool = Pool(JOBS) + try: + for x in pool.imap_unordered(func, it): + yield x + pool.close() + pool.join() + except: + pool.terminate() + pool.join() + raise + pass + +# Remove a file if it exists. Raise an exception on directories. 
+def remove_file(filename:str)->None: + try: + os.unlink(filename) + except FileNotFoundError: + pass + +# Sort a file, ie replace it with a sorted version +def sort_file_inplace(filename:str, tmp_filename:str)->None: + cmd = [ 'sort', '--parallel=%d' % JOBS, '-o', tmp_filename, filename ] + log_shell_cmd(shlex.join(cmd)) + subprocess.run(cmd, check=True) + log_debug('rename(%s,%s)' % (tmp_filename, filename)) + os.unlink(filename) + os.rename(tmp_filename, filename) + +# Combine old index files into one and sort it by checksum +# Output saved to TMP_DIR/old_index.list +def combine_old_index_files(): + if OLD_INDEX_FILES: + log_info('Combining old index files into one') + # Use 'awk' to add StxChecksums' base directory to each relative filename in it, + # for each input file, otherwise we won't be able to find the referenced file + # later when we read these entries. + # + # Pipe awk's output to sort + # + # ie: ( awk [...] StxChecksums_1 ; awk [...] StxChecksums_2 ; ... ) | sort [...] + + # Start the sort process, reading from STDIN + combined_index_file = os.path.join(TMP_DIR, 'old_index.list') + sort_cmd = [ 'sort', '--parallel=%s' % JOBS, '--output=%s' % combined_index_file ] + log_shell_cmd(shlex.join(sort_cmd)) + sort_proc = subprocess.Popen(sort_cmd, stdin=subprocess.PIPE) + + # For each input file, execute AWK with its STDOUT set to sort's STDIN + try: + dst_dir_realpath = os.path.realpath(DST_DIR) + awk_expr = '{ if (match($0, /^[[:space:]]*[^[:space:]]+[[:space:]]+/) >= 0) print substr($0, 1, RLENGTH) DIR substr($0, RLENGTH+1) }' + for old_index_file in OLD_INDEX_FILES: + try: + # Skip StxChecksums file that we are (re-)generating now + base_dir = os.path.realpath(os.path.dirname(old_index_file)) + if base_dir == dst_dir_realpath and os.path.basename(old_index_file) == 'StxChecksums': + log_warn('Ignoring output index file %s' % old_index_file) + continue + + # Input file may get deleted by job cleanup scripts from underneath us + # Open the file for 
reading and pass the open file descriptor to AWK + with open(old_index_file) as old_index_fh: + os.set_inheritable(old_index_fh.fileno(), True) + log_debug('fd %d = %s' % (old_index_fh.fileno(), old_index_file)) + awk_cmd = [ 'awk', '-v', 'DIR=%s/' % base_dir, awk_expr, '/dev/fd/%d' % old_index_fh.fileno() ] + log_shell_cmd(shlex.join(awk_cmd)) + subprocess.run(awk_cmd, stdout=sort_proc.stdin, check=True, close_fds=False) + except OSError as e: + # Ignore errors (typically ENOENT) -- fall back to copy elsewhere + log_warn('Failed to process %s: %s' % (old_index_file, str(e))) + continue + finally: + sort_proc.stdin.close() + sort_proc.wait() + if sort_proc.returncode != 0: + raise subprocess.CalledProcessError(returncode=sort_proc.returncode, cmd=sort_cmd) + + +# Format a line of StxChecksums file +def format_index_line(rel_path:str, orig_path:str, checksum:str, st:os.stat_result)->str: + return '%s %s %d %d %d %d %s' % (checksum, rel_path, st.st_size, st.st_mtime, st.st_dev, st.st_ino, orig_path) + +# File information for intermediate file lists +@dataclass +class FileInfo: + dev:int + ino:int + uid:int + gid:int + mode:int + size:int + mtime:float + checksum:str + rel_path:str + +# Create a FileInfo object from a stat record +def stat_to_file_info(st:os.stat_result, checksum:str, rel_path:str)->FileInfo: + return FileInfo(st.st_dev, st.st_ino, st.st_uid, st.st_gid, st.st_mode, st.st_size, st.st_mtime, checksum, rel_path) + +# Format a FileInfo record as a line of text +# DEV INO UID GID MODE SIZE MTIME CHECKSUM REL_PATH +def format_file_info(fi:FileInfo)->str: + return '%d %d %d %d %d %d %f %s %s' % (fi.dev, fi.ino, fi.uid, fi.gid, fi.mode, fi.size, fi.mtime, fi.checksum, fi.rel_path) + +# Parse a line of text into a FileInfo object +# DEV INO UID GID MODE SIZE MTIME CHECKSUM REL_PATH +RE_FILE_INFO = re.compile(r'^(\d+) (\d+) (\d+) (\d+) (\d+) (\d+) (\S+) (-|[0-9a-f]+) (.+)$', re.ASCII) +def parse_file_info(line:str)->FileInfo: + match = 
RE_FILE_INFO.match(line) + if match: + return FileInfo( + int(match.group(1)), # dev + int(match.group(2)), # ino + int(match.group(3)), # uid + int(match.group(4)), # gid + int(match.group(5)), # mode + int(match.group(6)), # size + float(match.group(7)), # mtime + match.group(8), # checksum + match.group(9), # rel_path + ) + return None + +# Read a list of FileInfo objects from a file +def read_file_info_lines(filename:str)->Iterable[FileInfo]: + with open(filename) as fh: + for line in fh: + fi = parse_file_info(line.rstrip('\n')) + if fi: + yield fi +# +# Find a hardlink candidate among the index (StxChecksums) files +# generated by older builds. +# Returns an iterator of tuples (old_path, stat_result), or None. +# +RE_OLD_FILE_INFO_LIST = [ + # Faster, but won't match filenames with spaces in them + re.compile(r'^([0-9a-f]+) (\S+) (\d+) (\d+) (\d+) (\d+) (.+)$', re.ASCII), + # Slower (because of .+ in the middle) + re.compile(r'^([0-9a-f]+) (.+) (\d+) (\d+) (\d+) (\d+) (.+)$', re.ASCII) +] +def find_old_files(checksum:str)->Iterable[tuple[str, os.stat_result]]: + # If there are no index files => no combined index either + if OLD_INDEX_FILES: + cmd = [ 'look', '%s ' % checksum, os.path.join(TMP_DIR, 'old_index.list') ] + log_shell_cmd(shlex.join(cmd)) + p = subprocess.Popen(cmd, stdout=subprocess.PIPE, encoding='utf8') + try: + for line in p.stdout: + line = line.rstrip('\n') + re_match_found = False + for regex in RE_OLD_FILE_INFO_LIST: + match = regex.match(line) + if match: + re_match_found = True + full_path = match.group(2) + size = int(match.group(3)) + mtime = int(match.group(4)) + try: + st = os.stat(full_path, follow_symlinks=False) + # NOTE: index files store time stamps as integer's (ie truncated) + if st.st_size == size and int(st.st_mtime) == mtime: + yield (full_path, st) + else: + log_debug('ignoring old index entry because its metadata doesn\'t match reality [%s] size=%d:%d mtime=%d:%d' % (line, size, st.st_size, mtime, int(st.st_mtime))) + 
except FileNotFoundError: + log_debug('ignoring old index entry because the referenced file doesn\'t exist: %s' % full_path) + except OSError as e: + log_warn('ignoring old index entry: %s: %s' % (full_path, str(e))) + if not re_match_found: + log_warn('Failed to parse (old) index line [%s]' % line) + finally: + p.stdout.close() + p.wait() + +# +# Search SRC_DIR and save the FileInfo entries to 3 files: +# dirs.list -- directores +# files.list -- non-directories with unique dev/ino +# links.list -- duplicate dev/inos +# +# All files will have the checksum field set to "-"; we will calculate +# the checksums separately for files.list. +# +# Returns a tuple with total counts. +# +def find_files()->tuple[int,int,int]: + + log_info("searching for files") + + dirs_file = os.path.join(TMP_DIR, 'dirs.list') + dirs_fh = None + dirs_count = 0 + + files_file = os.path.join(TMP_DIR, 'files.list') + files_fh = None + files_count = 0 + + links_file = os.path.join(TMP_DIR, 'links.list') + links_fh = None + links_count = 0 + + try: + log_debug('creating %s' % dirs_file) + dirs_fh = open(dirs_file, 'w') + + log_debug('creating %s' % files_file) + files_fh = open(files_file, 'w') + + log_debug('creating %s' % links_file) + links_fh = open(links_file, 'w') + + dev_map = {} + + def walk_error(err:Exception)->None: + raise err + + dirs_count = 0 + files_count = 0 + links_count = 0 + + log_debug(' %s/' % '.') + st = os.stat(SRC_DIR, follow_symlinks=False) + print('%s' % format_file_info(stat_to_file_info(st, '-', '.')), file=dirs_fh) + dirs_count += 1 + + for (dirpath, dirnames, filenames) in os.walk(SRC_DIR, onerror=walk_error): + rel_dirpath = dirpath[len(SRC_DIR)+1:] + extra_files = [] + + # directories + for dirname in dirnames: + full_path = os.path.join(dirpath, dirname) + st = os.stat(full_path, follow_symlinks=False) + # os.walk() returns directory symlinks as "directories" here. 
+ # Treat them as any other non-directory file below + if stat.S_ISDIR(st.st_mode): + rel_path = os.path.join(rel_dirpath, dirname) + log_debug(' %s/' % rel_path) + print('%s' % format_file_info(stat_to_file_info(st, '-', rel_path)), file=dirs_fh) + dirs_count += 1 + else: + extra_files.append(dirname) + + # files + for filename in itertools.chain.from_iterable([filenames, extra_files]): + rel_path = os.path.join(rel_dirpath, filename) + full_path = os.path.join(dirpath, filename) + log_debug (' %s' % rel_path) + st = os.stat(full_path, follow_symlinks=False) + ino_map = dev_map.get(st.st_dev) + if ino_map is None: + ino_map = {} + dev_map[st.st_dev] = ino_map + if st.st_ino not in ino_map: + ino_map[st.st_ino] = None + fh = files_fh + files_count += 1 + else: + fh = links_fh + links_count += 1 + print('%s' % format_file_info(stat_to_file_info(st, '-', rel_path)), file=fh) + + finally: + for fh in (links_fh, files_fh, dirs_fh): + if fh is not None: + fh.close() + + # Sort files.list because we need to look up duplicate devno/ino entries + # there for creating links + sort_file_inplace(files_file, '%s.tmp' % files_file) + + log_info ('found dirs=%d files=%d links=%d' % (dirs_count, files_count, links_count)) + return (dirs_count, files_count, links_count) + +# Get the SHA256 of a file +def get_sha256(path:str)->str: + with open(path, "rb") as f: + file_hash = hashlib.sha256() + while chunk := f.read(CHECKSUM_READ_SIZE): + file_hash.update(chunk) + return file_hash.hexdigest() + +# Calculate and add the checksum given a FileInfo and return +# the updated FileInfo. Make no changes for non-regfiles. 
+def add_one_checksum(fi:FileInfo)->FileInfo: + if stat.S_ISREG(fi.mode): + src_path = os.path.join(SRC_DIR, fi.rel_path) + log_debug('sha256(%s)' % src_path) + fi.checksum = get_sha256(src_path) + return fi + +# +# Add checksums and sort files.list +# +def calc_checksums(files_count:int)->None: + log_info("calculating checksums, count=%d" % files_count) + + list_file = os.path.join(TMP_DIR, 'files.list') + tmp_list_file = os.path.join(TMP_DIR, 'files.list.tmp') + + log_debug('creating sorted %s' % tmp_list_file) + with open(tmp_list_file, 'w') as fh: + fi_iter = read_file_info_lines(list_file) + for fi in map_p(add_one_checksum, fi_iter): + print(format_file_info(fi), file=fh) + + cmd = [ 'sort', '--parallel=%d' % JOBS, '-o', list_file, tmp_list_file ] + log_shell_cmd(shlex.join(cmd)) + subprocess.run(cmd, check=True) + os.unlink(tmp_list_file) + +# +# Create directores at destination +# +def create_dirs(dirs_count:int)->None: + log_info("creating directories, count=%d" % dirs_count) + for fi in read_file_info_lines(os.path.join(TMP_DIR, 'dirs.list')): + if fi.rel_path == '.': + path = DST_DIR + else: + path = os.path.join(DST_DIR, fi.rel_path) + + dst_exists = False + try: + st = os.stat(path) + if stat.S_ISDIR(st.st_mode): + dst_exists = True + else: + remove_file(path) + except FileNotFoundError: + pass + + if not dst_exists: + log_debug('mkdir(%s)' % path) + os.mkdir(path) + + # If we are not root, set directory permissions to be + # writable by owner, because we will be creating files + # there. This will fail if destination directory is not + # already owned by us (to be expected). 
+ if os.geteuid() != 0: + log_debug('chmod(%s, 0%o)' % (path, 0o700)) + # Don't set follow_symlinks because this function + # is never called for symlinks + os.chmod(path, 0o700) + +# Copy a file and its attributes, but change UID/GID as specified +def do_copy(src_path:str, dst_path:str, new_uid:int, new_gid:int)->None: + #log_debug("copy(%s, %s)" % (src_path, dst_path)) + cmd = [ 'cp', '-a' ] + if COPY_REFLINK: + cmd.append('--reflink') + cmd.append('--no-dereference') + cmd.append('--') + cmd.append(src_path) + cmd.append(dst_path) + log_shell_cmd(shlex.join(cmd)) + subprocess.run(cmd, check=True) + + # Doesn't support reflinks, see https://github.com/python/cpython/issues/81338 + #shutil.copy2(src_path, dst_path, follow_symlinks=False) + + st = os.stat(dst_path, follow_symlinks=False) + if new_gid != st.st_gid or new_uid != st.st_uid: + log_debug('chown(%s, %d, %d)' % (dst_path, new_uid, new_gid)) + os.chown(dst_path, new_uid, new_gid) + st = os.stat(dst_path, follow_symlinks=False) + + return st + +# +# Copy or link a regfile: +# If there's an older file with the same checksum, link it +# Otherwise copy it +# If linking fails, also copy it +# +# Return tuple(REL_PATH, FULL_PATH, CHECKSUM, DST_STAT_RESULT, LINKED, COPIED, SKIPPED) +# +def copy_one_file(fi:FileInfo)->tuple: + dst_path = os.path.join(DST_DIR, fi.rel_path) + src_path = os.path.join(SRC_DIR, fi.rel_path) + + # Work out target file's UID/GID + if CHANGE_GID is not None: + new_gid = CHANGE_GID + else: + new_gid = fi.gid + if os.geteuid() != 0 and new_gid not in CURRENT_GID_LIST: + new_gid = os.getegid() + + if CHANGE_UID is not None: + new_uid = CHANGE_UID + else: + new_uid = fi.uid + if os.geteuid() != 0: + new_uid = os.geteuid() + + # Skip existing files + if SKIP_EXISTING: + try: + st = os.stat(dst_path, follow_symlinks=False) + if st.st_uid == new_uid and \ + st.st_gid == new_gid and \ + st.st_size == fi.size and \ + st.st_mtime == fi.mtime and \ + st.st_mode == fi.mode: + log_debug('skipping 
existing %s' % dst_path) + # (REL_PATH, FULL_PATH, CHECKSUM, STAT_RES, LINKED, COPIED, SKIPPED) + return (fi.rel_path, dst_path, fi.checksum, st, 0, 0, 1) + except FileNotFoundError: + pass + + # Delete destination file if it exists + remove_file(dst_path) + + # Regular file: try to link it to a file from an older build + if stat.S_ISREG(fi.mode) and fi.checksum != '-': + + # Look up an identical file among the older builds + for (old_path, old_st) in find_old_files(fi.checksum): + try: + log_debug('found link candidate by checksum: %s' % old_path) + # Only link old files whose attributes match the source file + # except mtime + if old_st.st_uid == new_uid and \ + old_st.st_gid == new_gid and \ + old_st.st_size == fi.size and \ + old_st.st_mode == fi.mode: + log_debug('link(%s,%s)' % (old_path, dst_path)) + os.link(old_path, dst_path) + dst_stat = os.stat(dst_path, follow_symlinks=False) + # (REL_PATH, FULL_PATH, CHECKSUM, STAT_RES, LINKED, COPIED, SKIPPED) + return (fi.rel_path, old_path, fi.checksum, dst_stat, 1, 0, 0) + break + except OSError as e: + log_warn('link(old_path,dst_path): %s' % str(e)) + + # Checksum not found, or link failed: copy + dst_stat = do_copy(src_path, dst_path, new_uid, new_gid) + # (REL_PATH, FULL_PATH, CHECKSUM, STAT_RES, LINKED, COPIED, SKIPPED) + return (fi.rel_path, dst_path, fi.checksum, dst_stat, 0, 1, 0) + +# +# Copy files to DST_DIR +# +# Returns tuple (total_linked, total_copied, total_skipped) +# +def copy_files(files_count:int)->tuple[int,int]: + log_info("copying files, count=%d" % files_count) + total_linked = 0 + total_copied = 0 + total_skipped = 0 + with open(os.path.join(TMP_DIR, 'files.index'), 'w') as fh: + fi_iter = read_file_info_lines(os.path.join(TMP_DIR, 'files.list')) + for (rel_path, full_path, checksum, st, linked, copied, skipped) in map_p(copy_one_file, fi_iter): + total_linked += linked + total_copied += copied + total_skipped += skipped + if stat.S_ISREG(st.st_mode): + index_line = 
format_index_line(rel_path, full_path, checksum, st) + print('%s' % index_line, file = fh) + + return (total_linked, total_copied, total_skipped) + +# +# Re-create a hardlink at destination, ie create a file +# as a link to a previously copied file, because it was +# linked in SRC_DIR. +# +# Fall back to copy if link fails. +# +# Return tuple(REL_PATH, FULL_PATH, CHECKSUM, DST_STAT_RESULT, LINKED, COPIED, SKIPPED) +# +def copy_one_link(fi:FileInfo)->tuple: + dst_path = os.path.join(DST_DIR, fi.rel_path) + src_path = os.path.join(SRC_DIR, fi.rel_path) + + # Delete destination file if it exists + remove_file(dst_path) + + # Try to link it to a file we previously installed in copy_files() + # Find the previously-installed source file in files.list, by dev/ino + try: + cmd = [ 'look', '%d %d ' % (fi.dev, fi.ino), os.path.join(TMP_DIR, 'files.list') ] + log_shell_cmd(shlex.join(cmd)) + cmd_res = subprocess.run(cmd, check=False, encoding='utf8', stdout=subprocess.PIPE).stdout + old_fi = parse_file_info(cmd_res) + if old_fi: + orig_path = os.path.join(DST_DIR, old_fi.rel_path) + log_debug('link(%s,%s)' % (orig_path, dst_path)) + os.link(orig_path, dst_path) + st = os.stat(dst_path, follow_symlinks=False) + # (REL_PATH, FULL_PATH, CHECKSUM, STAT_RES, LINKED, COPIED, SKIPPED) + return (fi.rel_path, orig_path, old_fi.checksum, st, 1, 0, 0) + except OSError as e: + log_warn('failed to link %s: %s' % (dst_path, str(e))) + + # Fall back to copy + return copy_one_file(fi) + +# +# Re-create or copy hardlinks at destination +# +# Returns tuple (total_linked, total_copied, total_skipped) +# +def copy_links(links_count:int)->tuple[int,int]: + log_info("copying links, count=%d" % links_count) + total_linked = 0 + total_copied = 0 + total_skipped = 0 + with open(os.path.join(TMP_DIR, 'links.index'), 'w') as fh: + fi_iter = read_file_info_lines(os.path.join(TMP_DIR, 'links.list')) + for (rel_path, full_path, checksum, st, linked, copied, skipped) in map_p(copy_one_link, fi_iter): + 
total_linked += linked + total_copied += copied + total_skipped += skipped + if stat.S_ISREG(st.st_mode): + index_line = format_index_line(rel_path, full_path, checksum, st) + print('%s' % index_line, file = fh) + return (total_linked, total_copied, total_skipped) + +# +# Set directory permissions & ownership to how they were at the source +# +def adjust_one_dir_perms(fi:FileInfo)->FileInfo: + path = os.path.join(DST_DIR, fi.rel_path) + perms = stat.S_IMODE(fi.mode) + log_debug("chmod(%s, 0%o)" % (path, perms)) + # Don't set follow_symlinks because this function + # is never called for symlinks + os.chmod(path, perms) + + # At this point target directory exists and is owned + # by the current UID:GID due to create_dirs(). + st = os.stat(path, follow_symlinks=False) + + if CHANGE_GID is not None: + new_gid = CHANGE_GID + else: + new_gid = fi.gid + if os.geteuid() != 0 and new_gid not in CURRENT_GID_LIST: + new_gid = os.getegid() + + if CHANGE_UID is not None: + new_uid = CHANGE_UID + else: + new_uid = fi.uid + if os.geteuid() != 0: + new_uid = os.geteuid() + + if new_uid != st.st_uid or new_gid != st.st_gid: + log_debug("chown(%s, %d, %d)" % (path, new_uid, new_gid)) + os.chown(path, new_uid, new_gid, follow_symlinks=False) + + # Set both access time and modification time to modification fime of the + # source directory + log_debug("utime(%s, (%f, %f))" % (path, fi.mtime, fi.mtime)) + os.utime(path, (fi.mtime, fi.mtime)) + + return fi + +# +# Adjust directory permissions & ownership at destination +# +def adjust_dir_perms(dirs_count:int)->None: + log_info("adjusting directory permissions, count=%d" % dirs_count) + fi_iter = read_file_info_lines(os.path.join(TMP_DIR, 'dirs.list')) + for fi in map_p(adjust_one_dir_perms, fi_iter): + pass + +# Save or print "standard" index (StxChecksums) for regfiles and links +def save_index(files_count:int, links_count:int)->None: + files_index_file = os.path.join(TMP_DIR, 'files.index') + links_index_file = os.path.join(TMP_DIR, 
'links.index') + full_index_file = os.path.join(DST_DIR, 'StxChecksums') + log_info('creating index, count=%d' % (files_count + links_count)) + + sort_cmd = [ 'sort', '--parallel=%d' % JOBS, '--output=%s' % full_index_file, files_index_file, links_index_file ] + + log_shell_cmd(shlex.join(sort_cmd)) + subprocess.run(sort_cmd, check=True) + +# Delete temp files +def cleanup(): + if not KEEP_TEMP_FILES: + tmp_files = [ + 'dirs.list', + 'files.index', + 'files.list', + 'links.index', + 'links.list', + 'old_index.list', + ] + for file in tmp_files: + remove_file(os.path.join(TMP_DIR, file)) + +# process command line +def init()->None: + def positive_integer(s:str)->int: + v = int(s) + if v < 1: + raise ValueError() + return v + def user_id(s:str)->int: + try: + uid = int(s) + except: + try: + uid = pwd.getpwnam(s).pw_uid + except: + raise ValueError() + if uid < 0: + raise ValueError + return uid + def group_id(s:str)->int: + try: + uid = int(s) + except: + try: + uid = grp.getgrnam(s).gr_gid + except: + raise ValueError() + if uid < 0: + raise ValueError + return uid + + p = argparse.ArgumentParser() + p.add_argument('-j', '--jobs', type=positive_integer, default=1) + p.add_argument('--owner', type=user_id) + p.add_argument('--group', type=group_id) + p.add_argument('--checksum-hardlink', action='store_true', default=False) + p.add_argument('--old-index-files-from') + p.add_argument('--output-checksums') + p.add_argument('--skip-existing', action='store_true', default=False) + p.add_argument('-v', '--verbose', action='count', default=0, dest='verbosity') + p.add_argument('--reflink', action='store_true', default=False) + p.add_argument('--keep-temp-files', action='store_true', default=False) + p.add_argument('SRC_DIR') + p.add_argument('DST_DIR') + p.add_argument('TMP_DIR') + p.add_argument('old_index_files', nargs='*') + p.format_help = lambda: HELP + args = p.parse_args() + + current_gid_list = [ os.getegid(), *os.getgroups() ] + if args.owner is not None: + if 
os.geteuid() != 0 and args.owner != os.geteuid(): + log_error('--owner can only be changed by root') + sys.exit(1) + if args.group is not None: + if os.geteuid() != 0 and args.group not in current_gid_list: + log_error('--group can only be changed by root; or it must be a group you are a member of') + sys.exit(1) + + existing_old_index_files = [] + if args.checksum_hardlink: + old_index_files = [] + old_index_files += args.old_index_files + if args.old_index_files_from: + with open(args.old_index_files_from) as fh: + for filename in fh: + filename = filename.rstrip() + old_index_files.append(filename) + # Ignore missing/non-readable files because they may disappear + # while this script is running + for filename in old_index_files: + try: + with open(filename) as ref_fh: + existing_old_index_files.append(filename) + except OSError as x: + log_warn('Ignoring index file %s: %s' % (filename, str(x))) + elif args.old_index_files: + log_warn('old index files are meaningless without --checksum-hardlink') + + global JOBS, CHANGE_UID, CHANGE_GID, CURRENT_GID_LIST + global VERBOSITY, COPY_REFLINK, SRC_DIR, DST_DIR, TMP_DIR + global OLD_INDEX_FILES, OUTPUT_INDEX_FILE + global KEEP_TEMP_FILES, SKIP_EXISTING + JOBS = args.jobs + CHANGE_UID = args.owner + CHANGE_GID = args.group + CURRENT_GID_LIST = current_gid_list + VERBOSITY = args.verbosity + COPY_REFLINK = args.reflink + SRC_DIR = str(Path(args.SRC_DIR).absolute()) + DST_DIR = str(Path(args.DST_DIR).absolute()) + TMP_DIR = str(Path(args.TMP_DIR).absolute()) + OLD_INDEX_FILES = existing_old_index_files + OUTPUT_INDEX_FILE = args.output_checksums + SKIP_EXISTING = args.skip_existing + KEEP_TEMP_FILES = args.keep_temp_files + +init() + +log_debug('SRC_DIR=%s' % SRC_DIR) +log_debug('DST_DIR=%s' % DST_DIR) +log_debug('TMP_DIR=%s' % TMP_DIR) +log_debug('JOBS=%d' % JOBS) +if CHANGE_UID: + log_debug('CHANGE_UID=%d' % CHANGE_UID) +if CHANGE_GID: + log_debug('CHANGE_GID=%d' % CHANGE_GID) +log_debug('OLD_INDEX_FILES=%s' % 
OLD_INDEX_FILES) +log_debug('KEEP_TEMP_FILES=%d' % KEEP_TEMP_FILES) + +if not os.path.isdir(TMP_DIR): + os.mkdir(TMP_DIR) +(dirs_count, files_count, links_count) = find_files() +calc_checksums(files_count) +create_dirs(dirs_count) +combine_old_index_files() # DST_DIR must already exist +(linked1, copied1, skipped1) = copy_files(files_count) +(linked2, copied2, skipped2) = copy_links(links_count) +adjust_dir_perms(dirs_count) +save_index(files_count, links_count) +cleanup() +log_info('%s linked=%d copied=%d skipped=%d' % (DST_DIR, linked1+linked2, copied1+copied2, skipped1+skipped2)) diff --git a/scripts/helpers/archive-dir.sh b/scripts/helpers/archive-dir.sh deleted file mode 100755 index 6d19506..0000000 --- a/scripts/helpers/archive-dir.sh +++ /dev/null @@ -1,516 +0,0 @@ -#!/bin/bash - -PROGNAME="${BASH_SOURCE[0]##*/}" -SRC_DIR= -DST_DIR= -CHECKSUM_FILES_LIST_FILE= -DST_CHECKSUMS_FILE= -CHANGE_OWNER= -CHANGE_GROUP= -JOBS=1 -XTRACE=0 - -usage() { - echo -n "\ -Usage: $0 [OPTIONS...] SRC_DIR DST_DIR TMP_DIR - -Archive SRC_DIR in DST_DIR, using TMP_DIR for temporary files. - - -j,--jobs=N calculate checksums in parallel (default: 1) - --owner=OWNER set copied file's owner as specified - --group=GROUP set copied file's group as specified - - --output-checksums=CK_FILE - save StxChecksums to this file; by default print it to - STDOUT - - --checksum-hardlink=CK_LIST_FILE - Hardlink destination files if possible. CK_LIST_FILE - must contain a list of existing StxChecksums file names - from previously-archived directories, one per line. - We will use the files with matching properties & checksums - to create hard links in DST_DIR. - - --xtrace Enable debug output - -If executed by root, we will preserve owners/groups of the copied files, -unless they are overridden on the command line. - -If this script is called by non-root, it will create all files with the -calling user's effective user & group ownership. 
- -" - exit 0 -} - -cmdline_error() { - if [[ "$#" -gt 0 ]] ; then - echo "ERROR:" "$@" >&2; - fi - echo "Type \`$0 --help' for more info" >&2 - exit 1 -} - -check_pipe_status() { - local -a pipestatus=(${PIPESTATUS[*]}) - local -i i - for ((i=0; i<${#pipestatus[*]}; ++i)) ; do - [[ "${pipestatus[$i]}" -eq 0 ]] || return 1 - done - return 0 -} - -# Process command line -temp=$(getopt -o h,j: --long help,jobs:,owner:,group:,output-checksums:,checksum-hardlink:,xtrace -n "$PROGNAME" -- "$@") || cmdline_error -eval set -- "$temp" -while [[ "$#" -gt 0 ]] ; do - case "$1" in - -h|--help) - usage - exit 0 - ;; - -j|--jobs) - JOBS="$2" - if [[ ! "$JOBS" =~ ^[0-9]{1,2}$ || "$JOBS" -le 0 || "$JOBS" -ge 99 ]] ; then - cmdline_error "$1 must be an integer [1.99]" - fi - shift 2 - ;; - --owner) - CHANGE_OWNER="$2" - shift 2 - ;; - --group) - CHANGE_GROUP="$2" - shift 2 - ;; - --checksum-hardlink) - CHECKSUM_FILES_LIST_FILE="$2" - shift 2 - ;; - --output-checksums) - DST_CHECKSUMS_FILE="$2" - shift 2 - ;; - --xtrace) - XTRACE=1 - shift - ;; - --) - shift - break - ;; - *) - cmdline_error - ;; - esac -done -[[ "$#" -ge 3 ]] || cmdline_error "not enough arguments" -[[ "$#" -le 3 ]] || cmdline_error "too many arguments" -SRC_DIR="$1" -DST_DIR="$2" -TMP_DIR="$3" - -if [[ ! "$EGID" ]] ; then - EGID="$(id -g)" || exit 1 -fi - -if [[ $XTRACE -eq 1 ]] ; then - set -x -fi - -# Make sure BSD look is installed -if ! 
look --help >/dev/null ; then - echo "This script requires \"look\" to be installed" >&2 - exit 1 -fi - -# Check for GNU parallel -if parallel --help >/dev/null 2>&1 ; then - GNU_PARALLEL_EXISTS=1 -else - GNU_PARALLEL_EXISTS=0 -fi - -set -e - -# -# Combine checksum list files into one -# -if [[ "$CHECKSUM_FILES_LIST_FILE" ]] ; then - echo $'\n## Combining checksum lists into one' >&2 - combined_checksums_file="$TMP_DIR/combined_checksums.list" - while read -r checksums_file ; do - # skip empty lines and comments - if echo "$checksums_file" | grep -E '^\s*(#.*)$' ; then - continue - fi - # skip missing files - [[ -f "$checksums_file" ]] || continue - # add file path to the second token (file name) - checksums_dir="$(dirname "$checksums_file")" - awk -v "DIR=$checksums_dir/" '{ if (match($0, /^[[:space:]]*[^[:space:]]+[[:space:]]+/) >= 0) print substr($0, 1, RLENGTH) DIR substr($0, RLENGTH+1) }' \ - "$checksums_file" - done <"$CHECKSUM_FILES_LIST_FILE" | sort >"$combined_checksums_file" - check_pipe_status -fi - -# -# Create source file lists -# - -# Cretate a list file with each source file or dir + their stat properties -echo $'\n## Compiling file list: '"$SRC_DIR" >&2 -full_list_file="$TMP_DIR/full.list" -( cd "$SRC_DIR" && find -printf 'type=%y owner=%U group=%G mode=%#m size=%s mtime=%T@ name=%p\n' ) \ - | sed 's#name=[.]/#name=#' \ - | sed 's#\(mtime=[0-9]\+\)[.][0-9]\+#\1#g' \ - >"${full_list_file}" -check_pipe_status - -# Create another list file that contains only regular files -regfile_list_file="$TMP_DIR/regfile.list" -\grep '^type=f' "$full_list_file" | sort -k 7 >"$regfile_list_file" || exit 1 - -# Create a list file that contains only directories -# Sort by the last field "name=..." 
-dir_list_file="$TMP_DIR/dir.list" -\grep '^type=d' "$full_list_file" | sort -k 7 >"$dir_list_file" || exit 1 - -# Create a list file that contains all other entries (non-dirs & non-files) -other_list_file="$TMP_DIR/other.list" -\grep '^type=[^df]' "$full_list_file" | sort -k 7 >"$other_list_file" || exit 1 - - -# -# Usage: process_lines MESSAGE INPUT_FILE FUNC ARGS... -# -# Call shell function FUNC in parallel, similar to xargs. -# We will read lines from INPUT_FILE, then pass some subset of lines -# to FUNC many times in parallel, until all lines have been processed. -# Input lines will be appended as additional arguments to FUNC calls. -# -# FUNC and any global vars it references must be exported before -# calling process_lines(). -# -# MESSAGE will be printed to STDERR before starting -# -process_lines() { - - local message="$1" ; shift - local input_file="$1" ; shift - - # how many input lines? bail out if 0 - local line_count - line_count="$(cat "$input_file" | wc -l)" || exit 1 - [[ "$line_count" -gt 0 ]] || return 0 - - # How many lines to process at a time. The more the better, but with too - # many some child jobs may starve -- cap it at 256 - local lines_per_job - if [[ "$JOBS" -gt 1 ]] ; then - let lines_per_job="line_count / JOBS / 2" - if [[ "$lines_per_job" -eq 0 ]] ; then - lines_per_job=1 - elif [[ "$lines_per_job" -gt 256 ]] ; then - lines_per_job=256 - fi - else - lines_per_job=256 - fi - - echo "** $message [JOBS=$JOBS lines_per_job=$lines_per_job]" >&2 - - # Prefer GNU parallel because it can exit early - local -a cmd - if [[ $GNU_PARALLEL_EXISTS -eq 1 ]] ; then - cmd=(parallel --halt now,fail=1 -q -r -d '\n' -n $lines_per_job -P $JOBS "$@") - else - cmd=(xargs -r -d '\n' -n $lines_per_job -P $JOBS $SHELL -c '"$@"' unused_arg "$@") - fi - if ! 
"${cmd[@]}" <"$input_file" ; then - echo "ERROR: command failed (\"$message\")" >&2 - return 1 - fi -} - - -# -# create directories in sort order, ie create parents before -# children -# -echo $'\n## Creating directories: '"$DST_DIR" >&2 -while read -r line ; do - [[ -n "$line" ]] || continue - name="${line#* name=}" - [[ -n "$name" ]] || continue - attr_line="${line% name=*}" - mode="$(echo "$attr_line" | sed -n -r 's#.*mode=([0-9]+).*#\1#p')" - install_args=() - if [[ "$CHANGE_OWNER" ]] ; then - install_args+=("--owner" "$CHANGE_OWNER") - elif [[ $EUID -eq 0 ]] ; then - owner="$(echo "$attr_line" | sed -n -r 's#.*owner=([0-9]+).*#\1#p')" - install_args+=("--owner" "$owner") - fi - if [[ "$CHANGE_GROUP" ]] ; then - install_args+=("--group" "$CHANGE_GROUP") - elif [[ $EUID -eq 0 ]] ; then - group="$(echo "$attr_line" | sed -n -r 's#.*group=([0-9]+).*#\1#p')" - install_args+=("--group" "$group") - fi - echo " MKDIR $name" >&2 - if [[ -e "$DST_DIR/$name" && ! -d "$DST_DIR/$name" ]] ; then - \rm "$DST_DIR/$name" || exit 1 - fi - install -d "${install_args[@]}" "$DST_DIR/$name" -done <"$dir_list_file" || exit 1 - -# -# Copy or hardlink regular files -# -echo $'\n## Copying regular files: '"$SRC_DIR" >&2 - -# helper function to process regular files -# global vars used: -# SRC_DIR -# DST_DIR -# CHANGE_OWNER -# CHANGE_GROUP -# EUID (always definedby bash) -# EGID -# TMP_DIR -# XTRACE -# combined_checksums_file -process_regfiles() { - if [[ $XTRACE -eq 1 ]] ; then - set -x - fi - - # Temp file generated by this function. Its name must be unique to - # prevent interference from other jobs with -j N. 
- local matching_checksums_file - matching_checksums_file="$TMP_DIR/matching_checksums-$$.list" - - local line attr_line - for line in "$@" ; do - - # source file name relative to SRC_DIR - local name - name="${line#* name=}" - [[ "$name" ]] || continue - - # all attributes leading up to name= - attr_line="${line% name=*}" - - # source checksum - local checksum - #flock -s "$DST_DIR" echo " SHA256 $name" >&2 - checksum="$(sha256sum "$SRC_DIR/$name" | awk '{print $1}')" - if [[ ! "$checksum" ]] ; then - flock -s "$DST_DIR" echo "$SRC_DIR/$name: failed to calculate checksum" >&2 - return 1 - fi - - # source owner; or a user-provided override - local -a install_args=() - local owner - if [[ "$CHANGE_OWNER" ]] ; then - owner="$CHANGE_OWNER" - install_args+=("--owner" "$owner") - elif [[ $EUID -eq 0 ]] ; then - owner="$(echo "$attr_line" | sed -n -r 's#.* owner=([0-9]+).*#\1#p')" - install_args+=("--owner" "$owner") - else - owner=$EUID - fi - - # source group; or a user-provided override - local group - if [[ "$CHANGE_GROUP" ]] ; then - group="$CHANGE_GROUP" - install_args+=("--group" "$group") - elif [[ $EGID -eq 0 ]] ; then - group="$(echo "$attr_line" | sed -n -r 's#.* group=([0-9]+).*#\1#p')" - install_args+=("--group" "$group") - else - group=$EGID - fi - - # source file's mode/permissions - local mode - mode="$(echo "$attr_line" | sed -n -r 's#.* mode=([^[:space:]]+).*#\1#p')" - - # Search for the checksum in an older StxChecksums file - if [[ "$combined_checksums_file" ]] ; then - if look "$checksum " "$combined_checksums_file" >"$matching_checksums_file" 2>/dev/null ; then - ( - # As we read previosuly-archived files properties from StxChecksums, - # make sure they have not changed compared to the actual files on disk. 
- while read -r ref_checksum ref_name ref_size ref_mtime ref_dev ref_inode ref_path x_rest ; do - [[ -f "$ref_path" ]] || continue - # read on-disk file properties - local ref_stat - ref_stat=($(stat -c '%s %Y %u %g %#04a' "$ref_path" || true)) - [[ "${#ref_stat[@]}" -eq 5 ]] || continue - - # on-disk size does not match StxChecksums - local ref_ondisk_size - ref_ondisk_size="${ref_stat[0]}" - [[ "$ref_size" == "$ref_ondisk_size" ]] || continue - - # on-disk mtime does not match StxChecksums - local ref_ondisk_mtime - ref_ondisk_mtime="${ref_stat[1]}" - [[ "${ref_mtime}" == "$ref_ondisk_mtime" ]] || continue - - # on-disk owner does not match requested owner - local ref_ondisk_owner - ref_ondisk_owner="${ref_stat[2]}" - [[ "${owner}" == "$ref_ondisk_owner" ]] || continue - - # on-disk group does not match requested group - local ref_ondisk_group - ref_ondisk_group="${ref_stat[3]}" - [[ "${group}" == "$ref_ondisk_group" ]] || continue - - # on-disk mode does not match the mode of the source file - ref_ondisk_mode="${ref_stat[4]}" - [[ "${mode}" == "$ref_ondisk_mode" ]] || continue - - # At this point checksum, size, mtime, mode, owner, group and checksums of the - # exsiting file match with the file we are trying to copy. - # Use that file to create a hardlink. 
- flock -s "$DST_DIR" echo " LINK $name (from $ref_name)" >&2 - if ln -f "$ref_name" "${DST_DIR}/$name" ; then - flock -s "$DST_DIR" echo "$checksum $name $ref_size $ref_mtime $ref_dev $ref_inode $DST_DIR/$name" - exit 0 - fi - done <"$matching_checksums_file" - # checksum not found in older archives - exit 1 - ) && continue || true - fi - fi - - # No matching files found: really copy it - - if [[ -e "$DST_DIR/$name" ]] ; then - \rm "$DST_DIR/$name" || exit 1 - fi - - # source file's size & mtime - local size mtime - size="$(echo "$attr_line" | sed -n -r 's#.* size=([^[:space:]]+).*#\1#p')" - mtime="$(echo "$attr_line" | sed -n -r 's#.* mtime=([^[:space:]]+).*#\1#p')" - - # copy it to $DST_DIR - flock -s "$DST_DIR" echo " COPY $name" >&2 - rm -f "$DST_DIR/$name" || exit 1 - install --preserve-timestamps "${install_args[@]}" --mode="$mode" -T "$SRC_DIR/$name" "$DST_DIR/$name" || exit 1 - - # check destination file properties - local dst_stat dst_size dst_dev dst_ino - dst_stat=($(stat -c '%s %d %i' "$DST_DIR/$name")) || exit 1 - dst_size="${dst_stat[0]}" - dst_dev="${dst_stat[1]}" - dst_ino="${dst_stat[2]}" - - # file changed while copying - if [[ "$dst_size" != "$size" ]] ; then - flock -s "$DST_DIR" echo "ERROR: $SRC_DIR/$name changed while copying!" >&2 - exit 1 - fi - - # print out a line for StxChecksums using source file properties (preserved - # during copying), but with destination file's dev & ino. 
- flock -s "$DST_DIR" echo "$checksum $name $size $mtime $dst_dev $dst_ino $DST_DIR/$name" - done - - rm -f "$matching_checksums_file" -} - -# process files in parallel -( - if [[ "$DST_CHECKSUMS_FILE" ]] ; then - dst_checksums_fd=5 - exec 5<>"$DST_CHECKSUMS_FILE" || exit 1 - else - dst_checksums_fd=1 - fi - - export SRC_DIR \ - DST_DIR \ - CHANGE_OWNER \ - CHANGE_GROUP \ - EGID \ - TMP_DIR \ - XTRACE \ - combined_checksums_file - - export -f process_regfiles - - message="processing regular files" - process_lines "$message" "$regfile_list_file" process_regfiles | sort >&$dst_checksums_fd - [[ "${PIPESTATUS[0]}" -eq 0 && "${PIPESTATUS[1]}" -eq 0 ]] || exit 1 -) || exit 1 - - -# -# copy special files -# -echo $'\n## Copying special files: '"$DST_DIR" >&2 - -# helper function for processing special files -# global vars used: -# SRC_DIR -# DST_DIR -# CHANGE_OWNER -# CHANGE_GROUP -# XTRACE -process_other() { - if [[ $XTRACE -eq 1 ]] ; then - set -x - fi - local line attr_line - for line in "$@" ; do - local name - name="${line#* name=}" - [[ -n "$name" ]] || continue - attr_line="${line% name=*}" - - local type - type="$(echo "$attr_line" | sed 's#^type=\(.\) .*#\1#g')" - [[ -n "$type" ]] || continue - - flock -s "$DST_DIR" echo " CREATE type=$type $name" >&2 - if [[ -e "$DST_DIR/$name" ]] ; then - rm "$DST_DIR/$name" || exit 1 - fi - cp -a --no-dereference "$SRC_DIR/$name" "$DST_DIR/$name" || exit 1 - if [[ "$CHANGE_OWNER" || "$CHANGE_GROUP" ]] ; then - local chown_arg= - if [[ "$CHANGE_OWNER" ]] ; then - chown_arg="$CHANGE_OWNER" - fi - if [[ "$CHANGE_GROUP" ]] ; then - chown_arg+=":$CHANGE_GROUP" - fi - chown --no-dereference "$chown_arg" "$DST_DIR/$name" || exit 1 - fi - done -} - -# process them in parallel -( - export SRC_DIR \ - DST_DIR \ - CHANGE_OWNER \ - CHANGE_GROUP \ - XTRACE - - export -f process_other - - message="processing other files" - process_lines "$message" "$other_list_file" process_other || exit 1 -) || exit 1 diff --git a/scripts/lib/job_utils.sh 
b/scripts/lib/job_utils.sh index 7544fb7..4e86188 100644 --- a/scripts/lib/job_utils.sh +++ b/scripts/lib/job_utils.sh @@ -35,7 +35,7 @@ export REPO_TRACE=0 # docker images SAFE_RSYNC_DOCKER_IMG="servercontainers/rsync:3.1.3" -COREUTILS_DOCKER_IMG="starlingx/jenkins-pipelines-coreutils:20230529" +COREUTILS_DOCKER_IMG="starlingx/jenkins-pipelines-coreutils:20250709" notice() { ( set +x ; print_log -i --notice "$@" ; )