archive-dir.sh: rewrite in Python
Old script was slow and didn't preserve hardlinks within the source set.

This script doesn't link files that are identical within the source set,
ie same checksum & attributes, but different inode. It can only link such
files to similar files from older builds. This deficiency will be
addressed in a separate commit.

TESTS
===================
* Manually test various input directories, including:
  - a directory that contains each type of file (regular, devices,
    sockets, symlinks, etc)
  - old index files with spaces in file names
* Given a build with a dozen or so historical builds, copied the "aptly"
  directory and compared timing and destination directory size
  before/after this patch:
  - old script: time=4m13s size=56.0G
  - new script: time=14s  size=6.1G
* Run a Jenkins build that rebuilds one package, and doesn't
  clean/rebuild the ISO. Make sure "archive-misc" works as expected.

Change-Id: Ic8f8931c4143bc355db1ccbad56ed772c0f3081e
Signed-off-by: Davlet Panech <davlet.panech@windriver.com>
@@ -1,8 +1,8 @@
-FROM debian:11
+FROM debian:12
 
 RUN apt-get update -y && \
     apt-get upgrade -y && \
-    apt-get install -y bsdextrautils parallel && \
+    apt-get install -y bsdextrautils parallel python3 && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
@@ -109,13 +109,14 @@ do_archive_dir() {
     tmp_dir="$BUILD_HOME/tmp/archive-misc"
     rm -rf "$tmp_dir/$id"
     mkdir -p "$tmp_dir/$id"
-    cp -a "$THIS_DIR/helpers/archive-dir.sh" "$tmp_dir/"
+    cp -a "$THIS_DIR/helpers/archive-dir.py" "$tmp_dir/"
     local archive_args=()
     if [[ "$spec_method" == "checksum-hardlink" ]] ; then
         local old_checksums_file_list="$tmp_dir/$id/old_checksums_file.list"
         local find_func=find_old_checksum_files__$id
         $find_func >"$old_checksums_file_list"
-        archive_args+=("--checksum-hardlink" "$old_checksums_file_list")
+        archive_args+=("--checksum-hardlink")
+        archive_args+=("--old-index-files-from=$old_checksums_file_list")
         local extra_checksums_file
         for extra_checksums_file in "$@" ; do
             print_regfile_name_if_exists "$extra_checksums_file"
@@ -132,10 +133,10 @@ do_archive_dir() {
     local src_dir="$BUILD_HOME/$dir"
     local dst_dir="$BUILD_OUTPUT_HOME/$dir"
     maybe_run mkdir -p "$dst_dir"
-    safe_docker_run $DRY_RUN_ARG --writeable-archive-root --rm "$COREUTILS_DOCKER_IMG" "$tmp_dir/archive-dir.sh" \
+    safe_docker_run $DRY_RUN_ARG --writeable-archive-root --rm "$COREUTILS_DOCKER_IMG" "$tmp_dir/archive-dir.py" \
         "${archive_args[@]}" \
         -j ${PARALLEL_CMD_JOBS:-1} \
-        --output-checksums "$BUILD_OUTPUT_HOME/$dir/$CHECKSUMS_FILENAME" \
+        --output-checksums="$BUILD_OUTPUT_HOME/$dir/$CHECKSUMS_FILENAME" \
         "$src_dir" \
         "$dst_dir" \
         "$tmp_dir/$id"
scripts/helpers/archive-dir.py  (new executable file, 826 lines)
@@ -0,0 +1,826 @@
#!/usr/bin/env python3

import sys
assert sys.version_info >= (3, 9), "Python >= 3.9 is required"

HELP="""\
Usage: archive-dir.py [<OPTIONS>...] <SRC_DIR> <DST_DIR> <TMP_DIR>
                      [<OLD_INDEX_FILES>...]

Archive SRC_DIR in DST_DIR, using TMP_DIR for temporary files.

Create the index file, DST_DIR/StxChecksums.

With --checksum-hardlink, attempt to link identical files from older builds
instead of copying them.


  -v,--verbose      be verbose

  -j,--jobs=N       perform various operations in parallel (default: 1)

  --owner=OWNER     set destination files' owner; requires root
                    privileges.

  --group=GROUP     set destination files' group as specified; requires root
                    privileges, or current user must be a member of GROUP

  --checksum-hardlink
                    Hardlink destination files if possible. You must provide
                    one or more index files (StxChecksums) generated by older
                    builds. We will use the files with matching properties &
                    checksums to create hard links in DST_DIR.

  --old-index-files-from=OLD_INDEX_LIST_FILE
                    Read additional index file names from OLD_INDEX_LIST_FILE

  --output-checksums=CK_FILE
                    save the index (StxChecksums) to this file, instead of
                    DST_DIR/StxChecksums

  --reflink         Create light-weight (COW) file copies if possible. This
                    only applies when copying (ie when no link candidates
                    found)

  --skip-existing   Skip files that already exist at destination. We still need
                    to calculate their checksums in order to create the index,
                    but we will skip the copy.

  --keep-temp-files
                    Normally we delete temporary files upon successful
                    completion; this option will keep them.

"""

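# Example invocation (paths and job count are illustrative only):
#
#   archive-dir.py -j 4 --checksum-hardlink \
#       --old-index-files-from=/tmp/old_checksums_file.list \
#       /build/aptly /archive/aptly /tmp/archive-misc/aptly
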
# FIXME: this doesn't link files that are identical within the source set,
# ie same checksum & attributes, but different inode. It can only link
# such files to similar files from older builds.

import argparse
from collections.abc import Iterable
from collections.abc import Callable
from dataclasses import dataclass
import grp
import hashlib
import itertools
from multiprocessing import Pool
import os
from pathlib import Path
import pwd
import re
import shutil
import shlex
import stat
import subprocess
from typing import TextIO

JOBS = 1
CHECKSUM_READ_SIZE = 4 * 1024 * 1024  # 4 MiB
COPY_REFLINK = False
OLD_INDEX_FILES = []
SKIP_EXISTING = False
SRC_DIR = None
DST_DIR = None
TMP_DIR = None
CHANGE_UID = None
CHANGE_GID = None
VERBOSITY = 0
CURRENT_GID_LIST = []
OUTPUT_INDEX_FILE = None
KEEP_TEMP_FILES = False

def log_error(msg:str)->None:
    print('ERROR: %s' % msg, file=sys.stderr)

def log_warn(msg:str)->None:
    print('WARNING: %s' % msg, file=sys.stderr)

def log_info(msg:str)->None:
    print('%s' % msg, file=sys.stderr)

def log_debug(msg:str)->None:
    if VERBOSITY > 0:
        print('%s' % msg, file=sys.stderr)

def log_shell_cmd(cmd:str)->None:
    if VERBOSITY > 0:
        print('%% %s' % cmd, file=sys.stderr)

# Apply func to items returned by an iterator in parallel.
# Returns an iterator with the results of func, in unpredictable
# order.
def map_p(func:Callable, it:Iterable)->Iterable:
    pool = Pool(JOBS)
    try:
        for x in pool.imap_unordered(func, it):
            yield x
        pool.close()
        pool.join()
    except:
        pool.terminate()
        pool.join()
        raise

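# Usage sketch (hypothetical paths; get_sha256 is defined below). Results
# arrive in completion order, not input order:
#
#   for digest in map_p(get_sha256, ['/tmp/a.bin', '/tmp/b.bin']):
#       print(digest)
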
# Remove a file if it exists. Raise an exception on directories.
def remove_file(filename:str)->None:
    try:
        os.unlink(filename)
    except FileNotFoundError:
        pass

# Sort a file, ie replace it with a sorted version
def sort_file_inplace(filename:str, tmp_filename:str)->None:
    cmd = [ 'sort', '--parallel=%d' % JOBS, '-o', tmp_filename, filename ]
    log_shell_cmd(shlex.join(cmd))
    subprocess.run(cmd, check=True)
    log_debug('rename(%s,%s)' % (tmp_filename, filename))
    os.unlink(filename)
    os.rename(tmp_filename, filename)

# Combine old index files into one and sort it by checksum
# Output saved to TMP_DIR/old_index.list
def combine_old_index_files():
    if OLD_INDEX_FILES:
        log_info('Combining old index files into one')
        # Use 'awk' to add StxChecksums' base directory to each relative filename in it,
        # for each input file, otherwise we won't be able to find the referenced file
        # later when we read these entries.
        #
        # Pipe awk's output to sort
        #
        # ie: ( awk [...] StxChecksums_1 ; awk [...] StxChecksums_2 ; ... ) | sort [...]

        # Start the sort process, reading from STDIN
        combined_index_file = os.path.join(TMP_DIR, 'old_index.list')
        sort_cmd = [ 'sort', '--parallel=%s' % JOBS, '--output=%s' % combined_index_file ]
        log_shell_cmd(shlex.join(sort_cmd))
        sort_proc = subprocess.Popen(sort_cmd, stdin=subprocess.PIPE)

        # For each input file, execute AWK with its STDOUT set to sort's STDIN
        try:
            dst_dir_realpath = os.path.realpath(DST_DIR)
            awk_expr = '{ if (match($0, /^[[:space:]]*[^[:space:]]+[[:space:]]+/) >= 0) print substr($0, 1, RLENGTH) DIR substr($0, RLENGTH+1) }'
            for old_index_file in OLD_INDEX_FILES:
                try:
                    # Skip the StxChecksums file that we are (re-)generating now
                    base_dir = os.path.realpath(os.path.dirname(old_index_file))
                    if base_dir == dst_dir_realpath and os.path.basename(old_index_file) == 'StxChecksums':
                        log_warn('Ignoring output index file %s' % old_index_file)
                        continue

                    # Input file may get deleted by job cleanup scripts from underneath us.
                    # Open the file for reading and pass the open file descriptor to AWK
                    with open(old_index_file) as old_index_fh:
                        os.set_inheritable(old_index_fh.fileno(), True)
                        log_debug('fd %d = %s' % (old_index_fh.fileno(), old_index_file))
                        awk_cmd = [ 'awk', '-v', 'DIR=%s/' % base_dir, awk_expr, '/dev/fd/%d' % old_index_fh.fileno() ]
                        log_shell_cmd(shlex.join(awk_cmd))
                        subprocess.run(awk_cmd, stdout=sort_proc.stdin, check=True, close_fds=False)
                except OSError as e:
                    # Ignore errors (typically ENOENT) -- fall back to copy elsewhere
                    log_warn('Failed to process %s: %s' % (old_index_file, str(e)))
                    continue
        finally:
            sort_proc.stdin.close()
            sort_proc.wait()
            if sort_proc.returncode != 0:
                raise subprocess.CalledProcessError(returncode=sort_proc.returncode, cmd=sort_cmd)

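# For example (illustrative values), the entry
#   "3b0c44... foo.deb 100 1688000000 ..."
# read from /old/build/StxChecksums is rewritten by awk to
#   "3b0c44... /old/build/foo.deb 100 1688000000 ..."
# so that find_old_files() can later stat the referenced file by its
# absolute path.
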
# Format a line of StxChecksums file
def format_index_line(rel_path:str, orig_path:str, checksum:str, st:os.stat_result)->str:
    return '%s %s %d %d %d %d %s' % (checksum, rel_path, st.st_size, st.st_mtime, st.st_dev, st.st_ino, orig_path)

# File information for intermediate file lists
@dataclass
class FileInfo:
    dev:int
    ino:int
    uid:int
    gid:int
    mode:int
    size:int
    mtime:float
    checksum:str
    rel_path:str

# Create a FileInfo object from a stat record
def stat_to_file_info(st:os.stat_result, checksum:str, rel_path:str)->FileInfo:
    return FileInfo(st.st_dev, st.st_ino, st.st_uid, st.st_gid, st.st_mode, st.st_size, st.st_mtime, checksum, rel_path)

# Format a FileInfo record as a line of text
#   DEV INO UID GID MODE SIZE MTIME CHECKSUM REL_PATH
def format_file_info(fi:FileInfo)->str:
    return '%d %d %d %d %d %d %f %s %s' % (fi.dev, fi.ino, fi.uid, fi.gid, fi.mode, fi.size, fi.mtime, fi.checksum, fi.rel_path)

# Parse a line of text into a FileInfo object
#   DEV INO UID GID MODE SIZE MTIME CHECKSUM REL_PATH
RE_FILE_INFO = re.compile(r'^(\d+) (\d+) (\d+) (\d+) (\d+) (\d+) (\S+) (-|[0-9a-f]+) (.+)$', re.ASCII)
def parse_file_info(line:str)->FileInfo:
    match = RE_FILE_INFO.match(line)
    if match:
        return FileInfo(
            int(match.group(1)),    # dev
            int(match.group(2)),    # ino
            int(match.group(3)),    # uid
            int(match.group(4)),    # gid
            int(match.group(5)),    # mode
            int(match.group(6)),    # size
            float(match.group(7)),  # mtime
            match.group(8),         # checksum
            match.group(9),         # rel_path
        )
    return None

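# Example round-trip (illustrative values): format_file_info() renders a
# FileInfo as
#   "64769 131072 1000 1000 33188 4096 1688000000.000000 - src/a.bin"
# and parse_file_info() turns that line back into FileInfo(dev=64769,
# ino=131072, uid=1000, gid=1000, mode=33188, size=4096,
# mtime=1688000000.0, checksum='-', rel_path='src/a.bin');
# mode 33188 == 0o100644, ie a regular file with rw-r--r-- permissions.
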
# Read a list of FileInfo objects from a file
def read_file_info_lines(filename:str)->Iterable[FileInfo]:
    with open(filename) as fh:
        for line in fh:
            fi = parse_file_info(line.rstrip('\n'))
            if fi:
                yield fi

#
# Find a hardlink candidate among the index (StxChecksums) files
# generated by older builds.
# Returns an iterator of tuples (old_path, stat_result), or None.
#
RE_OLD_FILE_INFO_LIST = [
    # Faster, but won't match filenames with spaces in them
    re.compile(r'^([0-9a-f]+) (\S+) (\d+) (\d+) (\d+) (\d+) (.+)$', re.ASCII),
    # Slower (because of .+ in the middle)
    re.compile(r'^([0-9a-f]+) (.+) (\d+) (\d+) (\d+) (\d+) (.+)$', re.ASCII)
]
def find_old_files(checksum:str)->Iterable[tuple[str, os.stat_result]]:
    # If there are no index files => no combined index either
    if OLD_INDEX_FILES:
        cmd = [ 'look', '%s ' % checksum, os.path.join(TMP_DIR, 'old_index.list') ]
        log_shell_cmd(shlex.join(cmd))
        p = subprocess.Popen(cmd, stdout=subprocess.PIPE, encoding='utf8')
        try:
            for line in p.stdout:
                line = line.rstrip('\n')
                re_match_found = False
                for regex in RE_OLD_FILE_INFO_LIST:
                    match = regex.match(line)
                    if match:
                        re_match_found = True
                        full_path = match.group(2)
                        size = int(match.group(3))
                        mtime = int(match.group(4))
                        try:
                            st = os.stat(full_path, follow_symlinks=False)
                            # NOTE: index files store time stamps as integers (ie truncated)
                            if st.st_size == size and int(st.st_mtime) == mtime:
                                yield (full_path, st)
                            else:
                                log_debug('ignoring old index entry because its metadata doesn\'t match reality [%s] size=%d:%d mtime=%d:%d' % (line, size, st.st_size, mtime, int(st.st_mtime)))
                        except FileNotFoundError:
                            log_debug('ignoring old index entry because the referenced file doesn\'t exist: %s' % full_path)
                        except OSError as e:
                            log_warn('ignoring old index entry: %s: %s' % (full_path, str(e)))
                if not re_match_found:
                    log_warn('Failed to parse (old) index line [%s]' % line)
        finally:
            p.stdout.close()
            p.wait()

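# Note: 'look' performs a binary search, which is why old_index.list is
# kept sorted by combine_old_index_files(). Searching for '<checksum> '
# (with the trailing space) prints every line whose checksum field matches
# exactly -- one line per old file with that content.
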
#
# Search SRC_DIR and save the FileInfo entries to 3 files:
#   dirs.list  -- directories
#   files.list -- non-directories with unique dev/ino
#   links.list -- duplicate dev/inos
#
# All files will have the checksum field set to "-"; we will calculate
# the checksums separately for files.list.
#
# Returns a tuple with total counts.
#
def find_files()->tuple[int,int,int]:

    log_info("searching for files")

    dirs_file = os.path.join(TMP_DIR, 'dirs.list')
    dirs_fh = None
    dirs_count = 0

    files_file = os.path.join(TMP_DIR, 'files.list')
    files_fh = None
    files_count = 0

    links_file = os.path.join(TMP_DIR, 'links.list')
    links_fh = None
    links_count = 0

    try:
        log_debug('creating %s' % dirs_file)
        dirs_fh = open(dirs_file, 'w')

        log_debug('creating %s' % files_file)
        files_fh = open(files_file, 'w')

        log_debug('creating %s' % links_file)
        links_fh = open(links_file, 'w')

        dev_map = {}

        def walk_error(err:Exception)->None:
            raise err

        dirs_count = 0
        files_count = 0
        links_count = 0

        log_debug(' %s/' % '.')
        st = os.stat(SRC_DIR, follow_symlinks=False)
        print('%s' % format_file_info(stat_to_file_info(st, '-', '.')), file=dirs_fh)
        dirs_count += 1

        for (dirpath, dirnames, filenames) in os.walk(SRC_DIR, onerror=walk_error):
            rel_dirpath = dirpath[len(SRC_DIR)+1:]
            extra_files = []

            # directories
            for dirname in dirnames:
                full_path = os.path.join(dirpath, dirname)
                st = os.stat(full_path, follow_symlinks=False)
                # os.walk() returns directory symlinks as "directories" here.
                # Treat them as any other non-directory file below
                if stat.S_ISDIR(st.st_mode):
                    rel_path = os.path.join(rel_dirpath, dirname)
                    log_debug(' %s/' % rel_path)
                    print('%s' % format_file_info(stat_to_file_info(st, '-', rel_path)), file=dirs_fh)
                    dirs_count += 1
                else:
                    extra_files.append(dirname)

            # files
            for filename in itertools.chain.from_iterable([filenames, extra_files]):
                rel_path = os.path.join(rel_dirpath, filename)
                full_path = os.path.join(dirpath, filename)
                log_debug(' %s' % rel_path)
                st = os.stat(full_path, follow_symlinks=False)
                ino_map = dev_map.get(st.st_dev)
                if ino_map is None:
                    ino_map = {}
                    dev_map[st.st_dev] = ino_map
                if st.st_ino not in ino_map:
                    ino_map[st.st_ino] = None
                    fh = files_fh
                    files_count += 1
                else:
                    fh = links_fh
                    links_count += 1
                print('%s' % format_file_info(stat_to_file_info(st, '-', rel_path)), file=fh)

    finally:
        for fh in (links_fh, files_fh, dirs_fh):
            if fh is not None:
                fh.close()

    # Sort files.list because we need to look up duplicate devno/ino entries
    # there for creating links
    sort_file_inplace(files_file, '%s.tmp' % files_file)

    log_info('found dirs=%d files=%d links=%d' % (dirs_count, files_count, links_count))
    return (dirs_count, files_count, links_count)

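# Dedup sketch for the walk above: the first path seen for a given
# (st_dev, st_ino) pair goes to files.list and is copied normally; any
# later path with the same pair is a hardlink to it and goes to
# links.list, to be re-linked at the destination by copy_links().
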
# Get the SHA256 of a file
def get_sha256(path:str)->str:
    with open(path, "rb") as f:
        file_hash = hashlib.sha256()
        while chunk := f.read(CHECKSUM_READ_SIZE):
            file_hash.update(chunk)
        return file_hash.hexdigest()

# Calculate and add the checksum given a FileInfo and return
# the updated FileInfo. Make no changes for non-regfiles.
def add_one_checksum(fi:FileInfo)->FileInfo:
    if stat.S_ISREG(fi.mode):
        src_path = os.path.join(SRC_DIR, fi.rel_path)
        log_debug('sha256(%s)' % src_path)
        fi.checksum = get_sha256(src_path)
    return fi

#
# Add checksums and sort files.list
#
def calc_checksums(files_count:int)->None:
    log_info("calculating checksums, count=%d" % files_count)

    list_file = os.path.join(TMP_DIR, 'files.list')
    tmp_list_file = os.path.join(TMP_DIR, 'files.list.tmp')

    log_debug('creating sorted %s' % tmp_list_file)
    with open(tmp_list_file, 'w') as fh:
        fi_iter = read_file_info_lines(list_file)
        for fi in map_p(add_one_checksum, fi_iter):
            print(format_file_info(fi), file=fh)

    cmd = [ 'sort', '--parallel=%d' % JOBS, '-o', list_file, tmp_list_file ]
    log_shell_cmd(shlex.join(cmd))
    subprocess.run(cmd, check=True)
    os.unlink(tmp_list_file)

#
# Create directories at destination
#
def create_dirs(dirs_count:int)->None:
    log_info("creating directories, count=%d" % dirs_count)
    for fi in read_file_info_lines(os.path.join(TMP_DIR, 'dirs.list')):
        if fi.rel_path == '.':
            path = DST_DIR
        else:
            path = os.path.join(DST_DIR, fi.rel_path)

        dst_exists = False
        try:
            st = os.stat(path)
            if stat.S_ISDIR(st.st_mode):
                dst_exists = True
            else:
                remove_file(path)
        except FileNotFoundError:
            pass

        if not dst_exists:
            log_debug('mkdir(%s)' % path)
            os.mkdir(path)

        # If we are not root, set directory permissions to be
        # writable by owner, because we will be creating files
        # there. This will fail if destination directory is not
        # already owned by us (to be expected).
        if os.geteuid() != 0:
            log_debug('chmod(%s, 0%o)' % (path, 0o700))
            # Don't set follow_symlinks because this function
            # is never called for symlinks
            os.chmod(path, 0o700)

# Copy a file and its attributes, but change UID/GID as specified
def do_copy(src_path:str, dst_path:str, new_uid:int, new_gid:int)->os.stat_result:
    #log_debug("copy(%s, %s)" % (src_path, dst_path))
    cmd = [ 'cp', '-a' ]
    if COPY_REFLINK:
        cmd.append('--reflink')
    cmd.append('--no-dereference')
    cmd.append('--')
    cmd.append(src_path)
    cmd.append(dst_path)
    log_shell_cmd(shlex.join(cmd))
    subprocess.run(cmd, check=True)

    # Doesn't support reflinks, see https://github.com/python/cpython/issues/81338
    #shutil.copy2(src_path, dst_path, follow_symlinks=False)

    st = os.stat(dst_path, follow_symlinks=False)
    if new_gid != st.st_gid or new_uid != st.st_uid:
        log_debug('chown(%s, %d, %d)' % (dst_path, new_uid, new_gid))
        os.chown(dst_path, new_uid, new_gid)
        st = os.stat(dst_path, follow_symlinks=False)

    return st

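# Note on do_copy() above: with GNU cp, a bare '--reflink' means
# '--reflink=always', which fails outright on filesystems without
# copy-on-write support; '--reflink=auto' would instead fall back to a
# regular data copy.
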
#
# Copy or link a regfile:
#   If there's an older file with the same checksum, link it
#   Otherwise copy it
#   If linking fails, also copy it
#
# Return tuple(REL_PATH, FULL_PATH, CHECKSUM, DST_STAT_RESULT, LINKED, COPIED, SKIPPED)
#
def copy_one_file(fi:FileInfo)->tuple:
    dst_path = os.path.join(DST_DIR, fi.rel_path)
    src_path = os.path.join(SRC_DIR, fi.rel_path)

    # Work out target file's UID/GID
    if CHANGE_GID is not None:
        new_gid = CHANGE_GID
    else:
        new_gid = fi.gid
        if os.geteuid() != 0 and new_gid not in CURRENT_GID_LIST:
            new_gid = os.getegid()

    if CHANGE_UID is not None:
        new_uid = CHANGE_UID
    else:
        new_uid = fi.uid
        if os.geteuid() != 0:
            new_uid = os.geteuid()

    # Skip existing files
    if SKIP_EXISTING:
        try:
            st = os.stat(dst_path, follow_symlinks=False)
            if st.st_uid == new_uid and \
               st.st_gid == new_gid and \
               st.st_size == fi.size and \
               st.st_mtime == fi.mtime and \
               st.st_mode == fi.mode:
                log_debug('skipping existing %s' % dst_path)
                # (REL_PATH, FULL_PATH, CHECKSUM, STAT_RES, LINKED, COPIED, SKIPPED)
                return (fi.rel_path, dst_path, fi.checksum, st, 0, 0, 1)
        except FileNotFoundError:
            pass

    # Delete destination file if it exists
    remove_file(dst_path)

    # Regular file: try to link it to a file from an older build
    if stat.S_ISREG(fi.mode) and fi.checksum != '-':

        # Look up an identical file among the older builds
        for (old_path, old_st) in find_old_files(fi.checksum):
            try:
                log_debug('found link candidate by checksum: %s' % old_path)
                # Only link old files whose attributes match the source file
                # except mtime
                if old_st.st_uid == new_uid and \
                   old_st.st_gid == new_gid and \
                   old_st.st_size == fi.size and \
                   old_st.st_mode == fi.mode:
                    log_debug('link(%s,%s)' % (old_path, dst_path))
                    os.link(old_path, dst_path)
                    dst_stat = os.stat(dst_path, follow_symlinks=False)
                    # (REL_PATH, FULL_PATH, CHECKSUM, STAT_RES, LINKED, COPIED, SKIPPED)
                    return (fi.rel_path, old_path, fi.checksum, dst_stat, 1, 0, 0)
            except OSError as e:
                log_warn('link(%s,%s): %s' % (old_path, dst_path, str(e)))

    # Checksum not found, or link failed: copy
    dst_stat = do_copy(src_path, dst_path, new_uid, new_gid)
    # (REL_PATH, FULL_PATH, CHECKSUM, STAT_RES, LINKED, COPIED, SKIPPED)
    return (fi.rel_path, dst_path, fi.checksum, dst_stat, 0, 1, 0)

#
# Copy files to DST_DIR
#
# Returns tuple (total_linked, total_copied, total_skipped)
#
def copy_files(files_count:int)->tuple[int,int,int]:
    log_info("copying files, count=%d" % files_count)
    total_linked = 0
    total_copied = 0
    total_skipped = 0
    with open(os.path.join(TMP_DIR, 'files.index'), 'w') as fh:
        fi_iter = read_file_info_lines(os.path.join(TMP_DIR, 'files.list'))
        for (rel_path, full_path, checksum, st, linked, copied, skipped) in map_p(copy_one_file, fi_iter):
            total_linked += linked
            total_copied += copied
            total_skipped += skipped
            if stat.S_ISREG(st.st_mode):
                index_line = format_index_line(rel_path, full_path, checksum, st)
                print('%s' % index_line, file=fh)

    return (total_linked, total_copied, total_skipped)

#
# Re-create a hardlink at destination, ie create a file
# as a link to a previously copied file, because it was
# linked in SRC_DIR.
#
# Fall back to copy if link fails.
#
# Return tuple(REL_PATH, FULL_PATH, CHECKSUM, DST_STAT_RESULT, LINKED, COPIED, SKIPPED)
#
def copy_one_link(fi:FileInfo)->tuple:
    dst_path = os.path.join(DST_DIR, fi.rel_path)
    src_path = os.path.join(SRC_DIR, fi.rel_path)

    # Delete destination file if it exists
    remove_file(dst_path)

    # Try to link it to a file we previously installed in copy_files().
    # Find the previously-installed source file in files.list, by dev/ino
    try:
        cmd = [ 'look', '%d %d ' % (fi.dev, fi.ino), os.path.join(TMP_DIR, 'files.list') ]
        log_shell_cmd(shlex.join(cmd))
        cmd_res = subprocess.run(cmd, check=False, encoding='utf8', stdout=subprocess.PIPE).stdout
        old_fi = parse_file_info(cmd_res)
        if old_fi:
            orig_path = os.path.join(DST_DIR, old_fi.rel_path)
            log_debug('link(%s,%s)' % (orig_path, dst_path))
            os.link(orig_path, dst_path)
            st = os.stat(dst_path, follow_symlinks=False)
            # (REL_PATH, FULL_PATH, CHECKSUM, STAT_RES, LINKED, COPIED, SKIPPED)
            return (fi.rel_path, orig_path, old_fi.checksum, st, 1, 0, 0)
    except OSError as e:
        log_warn('failed to link %s: %s' % (dst_path, str(e)))

    # Fall back to copy
    return copy_one_file(fi)

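# Example (illustrative numbers): for a source hardlink with dev=64769
# ino=131072, the lookup
#   look '64769 131072 ' files.list
# works because format_file_info() puts dev and ino first and files.list
# is kept sorted, so it returns the already-copied entry whose rel_path
# we can hard-link to instead of copying the data again.
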
#
# Re-create or copy hardlinks at destination
#
# Returns tuple (total_linked, total_copied, total_skipped)
#
def copy_links(links_count:int)->tuple[int,int,int]:
    log_info("copying links, count=%d" % links_count)
    total_linked = 0
    total_copied = 0
    total_skipped = 0
    with open(os.path.join(TMP_DIR, 'links.index'), 'w') as fh:
        fi_iter = read_file_info_lines(os.path.join(TMP_DIR, 'links.list'))
        for (rel_path, full_path, checksum, st, linked, copied, skipped) in map_p(copy_one_link, fi_iter):
            total_linked += linked
            total_copied += copied
            total_skipped += skipped
            if stat.S_ISREG(st.st_mode):
                index_line = format_index_line(rel_path, full_path, checksum, st)
                print('%s' % index_line, file=fh)
    return (total_linked, total_copied, total_skipped)

#
# Set directory permissions & ownership to how they were at the source
#
def adjust_one_dir_perms(fi:FileInfo)->FileInfo:
    path = os.path.join(DST_DIR, fi.rel_path)
    perms = stat.S_IMODE(fi.mode)
    log_debug("chmod(%s, 0%o)" % (path, perms))
    # Don't set follow_symlinks because this function
    # is never called for symlinks
    os.chmod(path, perms)

    # At this point the target directory exists and is owned
    # by the current UID:GID due to create_dirs().
    st = os.stat(path, follow_symlinks=False)

    if CHANGE_GID is not None:
        new_gid = CHANGE_GID
    else:
        new_gid = fi.gid
        if os.geteuid() != 0 and new_gid not in CURRENT_GID_LIST:
            new_gid = os.getegid()

    if CHANGE_UID is not None:
        new_uid = CHANGE_UID
    else:
        new_uid = fi.uid
        if os.geteuid() != 0:
            new_uid = os.geteuid()

    if new_uid != st.st_uid or new_gid != st.st_gid:
        log_debug("chown(%s, %d, %d)" % (path, new_uid, new_gid))
        os.chown(path, new_uid, new_gid, follow_symlinks=False)

    # Set both access time and modification time to the modification time
    # of the source directory
    log_debug("utime(%s, (%f, %f))" % (path, fi.mtime, fi.mtime))
    os.utime(path, (fi.mtime, fi.mtime))

    return fi

#
# Adjust directory permissions & ownership at destination
#
def adjust_dir_perms(dirs_count:int)->None:
    log_info("adjusting directory permissions, count=%d" % dirs_count)
    fi_iter = read_file_info_lines(os.path.join(TMP_DIR, 'dirs.list'))
    for fi in map_p(adjust_one_dir_perms, fi_iter):
        pass

# Save or print "standard" index (StxChecksums) for regfiles and links
def save_index(files_count:int, links_count:int)->None:
    files_index_file = os.path.join(TMP_DIR, 'files.index')
    links_index_file = os.path.join(TMP_DIR, 'links.index')
    # Honor --output-checksums when given; default to DST_DIR/StxChecksums
    if OUTPUT_INDEX_FILE:
        full_index_file = OUTPUT_INDEX_FILE
    else:
        full_index_file = os.path.join(DST_DIR, 'StxChecksums')
    log_info('creating index, count=%d' % (files_count + links_count))

    sort_cmd = [ 'sort', '--parallel=%d' % JOBS, '--output=%s' % full_index_file, files_index_file, links_index_file ]

    log_shell_cmd(shlex.join(sort_cmd))
    subprocess.run(sort_cmd, check=True)

# Delete temp files
def cleanup():
    if not KEEP_TEMP_FILES:
        tmp_files = [
            'dirs.list',
            'files.index',
            'files.list',
            'links.index',
            'links.list',
            'old_index.list',
        ]
        for file in tmp_files:
            remove_file(os.path.join(TMP_DIR, file))

# process command line
def init()->None:
    def positive_integer(s:str)->int:
        v = int(s)
        if v < 1:
            raise ValueError()
        return v
    def user_id(s:str)->int:
        try:
            uid = int(s)
        except:
            try:
                uid = pwd.getpwnam(s).pw_uid
            except:
                raise ValueError()
        if uid < 0:
            raise ValueError
        return uid
    def group_id(s:str)->int:
        try:
            gid = int(s)
        except:
            try:
                gid = grp.getgrnam(s).gr_gid
            except:
                raise ValueError()
        if gid < 0:
            raise ValueError
        return gid

    p = argparse.ArgumentParser()
    p.add_argument('-j', '--jobs', type=positive_integer, default=1)
    p.add_argument('--owner', type=user_id)
    p.add_argument('--group', type=group_id)
    p.add_argument('--checksum-hardlink', action='store_true', default=False)
    p.add_argument('--old-index-files-from')
    p.add_argument('--output-checksums')
    p.add_argument('--skip-existing', action='store_true', default=False)
    p.add_argument('-v', '--verbose', action='count', default=0, dest='verbosity')
    p.add_argument('--reflink', action='store_true', default=False)
    p.add_argument('--keep-temp-files', action='store_true', default=False)
    p.add_argument('SRC_DIR')
    p.add_argument('DST_DIR')
    p.add_argument('TMP_DIR')
    p.add_argument('old_index_files', nargs='*')
    p.format_help = lambda: HELP
    args = p.parse_args()

    current_gid_list = [ os.getegid(), *os.getgroups() ]
    if args.owner is not None:
        if os.geteuid() != 0 and args.owner != os.geteuid():
            log_error('--owner can only be changed by root')
            sys.exit(1)
    if args.group is not None:
        if os.geteuid() != 0 and args.group not in current_gid_list:
            log_error('--group can only be changed by root; or it must be a group you are a member of')
            sys.exit(1)

    existing_old_index_files = []
    if args.checksum_hardlink:
        old_index_files = []
        old_index_files += args.old_index_files
        if args.old_index_files_from:
            with open(args.old_index_files_from) as fh:
                for filename in fh:
                    filename = filename.rstrip('\n')
                    old_index_files.append(filename)
        # Ignore missing/non-readable files because they may disappear
        # while this script is running
        for filename in old_index_files:
            try:
                with open(filename) as ref_fh:
                    existing_old_index_files.append(filename)
            except OSError as x:
                log_warn('Ignoring index file %s: %s' % (filename, str(x)))
    elif args.old_index_files:
        log_warn('old index files are meaningless without --checksum-hardlink')

    global JOBS, CHANGE_UID, CHANGE_GID, CURRENT_GID_LIST
    global VERBOSITY, COPY_REFLINK, SRC_DIR, DST_DIR, TMP_DIR
    global OLD_INDEX_FILES, OUTPUT_INDEX_FILE
    global KEEP_TEMP_FILES, SKIP_EXISTING
    JOBS = args.jobs
    CHANGE_UID = args.owner
    CHANGE_GID = args.group
    CURRENT_GID_LIST = current_gid_list
    VERBOSITY = args.verbosity
    COPY_REFLINK = args.reflink
    SRC_DIR = str(Path(args.SRC_DIR).absolute())
    DST_DIR = str(Path(args.DST_DIR).absolute())
    TMP_DIR = str(Path(args.TMP_DIR).absolute())
    OLD_INDEX_FILES = existing_old_index_files
    OUTPUT_INDEX_FILE = args.output_checksums
    SKIP_EXISTING = args.skip_existing
    KEEP_TEMP_FILES = args.keep_temp_files

init()

log_debug('SRC_DIR=%s' % SRC_DIR)
log_debug('DST_DIR=%s' % DST_DIR)
log_debug('TMP_DIR=%s' % TMP_DIR)
log_debug('JOBS=%d' % JOBS)
if CHANGE_UID is not None:
    log_debug('CHANGE_UID=%d' % CHANGE_UID)
if CHANGE_GID is not None:
    log_debug('CHANGE_GID=%d' % CHANGE_GID)
log_debug('OLD_INDEX_FILES=%s' % OLD_INDEX_FILES)
log_debug('KEEP_TEMP_FILES=%d' % KEEP_TEMP_FILES)

if not os.path.isdir(TMP_DIR):
    os.mkdir(TMP_DIR)
(dirs_count, files_count, links_count) = find_files()
calc_checksums(files_count)
create_dirs(dirs_count)
combine_old_index_files()  # DST_DIR must already exist
(linked1, copied1, skipped1) = copy_files(files_count)
(linked2, copied2, skipped2) = copy_links(links_count)
adjust_dir_perms(dirs_count)
save_index(files_count, links_count)
cleanup()
log_info('%s linked=%d copied=%d skipped=%d' % (DST_DIR, linked1+linked2, copied1+copied2, skipped1+skipped2))
scripts/helpers/archive-dir.sh  (deleted file, 516 lines)
@@ -1,516 +0,0 @@
#!/bin/bash

PROGNAME="${BASH_SOURCE[0]##*/}"
SRC_DIR=
DST_DIR=
CHECKSUM_FILES_LIST_FILE=
DST_CHECKSUMS_FILE=
CHANGE_OWNER=
CHANGE_GROUP=
JOBS=1
XTRACE=0

usage() {
    echo -n "\
Usage: $0 [OPTIONS...] SRC_DIR DST_DIR TMP_DIR

Archive SRC_DIR in DST_DIR, using TMP_DIR for temporary files.

  -j,--jobs=N    calculate checksums in parallel (default: 1)
  --owner=OWNER  set copied file's owner as specified
  --group=GROUP  set copied file's group as specified

  --output-checksums=CK_FILE
                 save StxChecksums to this file; by default print it to
                 STDOUT

  --checksum-hardlink=CK_LIST_FILE
                 Hardlink destination files if possible. CK_LIST_FILE
                 must contain a list of existing StxChecksums file names
                 from previously-archived directories, one per line.
                 We will use the files with matching properties & checksums
                 to create hard links in DST_DIR.

  --xtrace       Enable debug output

If executed by root, we will preserve owners/groups of the copied files,
unless they are overridden on the command line.

If this script is called by non-root, it will create all files with the
calling user's effective user & group ownership.

"
    exit 0
}

cmdline_error() {
    if [[ "$#" -gt 0 ]] ; then
        echo "ERROR:" "$@" >&2;
    fi
    echo "Type \`$0 --help' for more info" >&2
    exit 1
}

check_pipe_status() {
    local -a pipestatus=(${PIPESTATUS[*]})
    local -i i
    for ((i=0; i<${#pipestatus[*]}; ++i)) ; do
        [[ "${pipestatus[$i]}" -eq 0 ]] || return 1
    done
    return 0
}

# Process command line
temp=$(getopt -o h,j: --long help,jobs:,owner:,group:,output-checksums:,checksum-hardlink:,xtrace -n "$PROGNAME" -- "$@") || cmdline_error
eval set -- "$temp"
while [[ "$#" -gt 0 ]] ; do
    case "$1" in
        -h|--help)
            usage
            exit 0
            ;;
        -j|--jobs)
            JOBS="$2"
            if [[ ! "$JOBS" =~ ^[0-9]{1,2}$ || "$JOBS" -le 0 || "$JOBS" -ge 99 ]] ; then
                cmdline_error "$1 must be an integer [1..99]"
            fi
            shift 2
            ;;
        --owner)
            CHANGE_OWNER="$2"
            shift 2
            ;;
        --group)
            CHANGE_GROUP="$2"
            shift 2
            ;;
        --checksum-hardlink)
            CHECKSUM_FILES_LIST_FILE="$2"
            shift 2
            ;;
        --output-checksums)
            DST_CHECKSUMS_FILE="$2"
            shift 2
            ;;
        --xtrace)
            XTRACE=1
            shift
            ;;
        --)
            shift
            break
            ;;
        *)
            cmdline_error
            ;;
    esac
done
[[ "$#" -ge 3 ]] || cmdline_error "not enough arguments"
[[ "$#" -le 3 ]] || cmdline_error "too many arguments"
SRC_DIR="$1"
DST_DIR="$2"
TMP_DIR="$3"

if [[ ! "$EGID" ]] ; then
    EGID="$(id -g)" || exit 1
fi

if [[ $XTRACE -eq 1 ]] ; then
    set -x
fi

# Make sure BSD look is installed
if ! look --help >/dev/null ; then
    echo "This script requires \"look\" to be installed" >&2
    exit 1
fi

# Check for GNU parallel
if parallel --help >/dev/null 2>&1 ; then
    GNU_PARALLEL_EXISTS=1
else
    GNU_PARALLEL_EXISTS=0
fi

set -e

#
# Combine checksum list files into one
#
if [[ "$CHECKSUM_FILES_LIST_FILE" ]] ; then
    echo $'\n## Combining checksum lists into one' >&2
    combined_checksums_file="$TMP_DIR/combined_checksums.list"
    while read -r checksums_file ; do
        # skip empty lines and comments
        if echo "$checksums_file" | grep -E '^\s*(#.*)$' ; then
            continue
        fi
        # skip missing files
        [[ -f "$checksums_file" ]] || continue
        # add file path to the second token (file name)
        checksums_dir="$(dirname "$checksums_file")"
        awk -v "DIR=$checksums_dir/" '{ if (match($0, /^[[:space:]]*[^[:space:]]+[[:space:]]+/) >= 0) print substr($0, 1, RLENGTH) DIR substr($0, RLENGTH+1) }' \
            "$checksums_file"
    done <"$CHECKSUM_FILES_LIST_FILE" | sort >"$combined_checksums_file"
    check_pipe_status
fi

#
# Create source file lists
#

# Create a list file with each source file or dir + their stat properties
echo $'\n## Compiling file list: '"$SRC_DIR" >&2
full_list_file="$TMP_DIR/full.list"
( cd "$SRC_DIR" && find -printf 'type=%y owner=%U group=%G mode=%#m size=%s mtime=%T@ name=%p\n' ) \
    | sed 's#name=[.]/#name=#' \
    | sed 's#\(mtime=[0-9]\+\)[.][0-9]\+#\1#g' \
    >"${full_list_file}"
check_pipe_status

# Create another list file that contains only regular files
regfile_list_file="$TMP_DIR/regfile.list"
\grep '^type=f' "$full_list_file" | sort -k 7 >"$regfile_list_file" || exit 1

# Create a list file that contains only directories
# Sort by the last field "name=..."
dir_list_file="$TMP_DIR/dir.list"
\grep '^type=d' "$full_list_file" | sort -k 7 >"$dir_list_file" || exit 1

# Create a list file that contains all other entries (non-dirs & non-files)
other_list_file="$TMP_DIR/other.list"
\grep '^type=[^df]' "$full_list_file" | sort -k 7 >"$other_list_file" || exit 1

#
# Usage: process_lines MESSAGE INPUT_FILE FUNC ARGS...
#
# Call shell function FUNC in parallel, similar to xargs.
# We will read lines from INPUT_FILE, then pass some subset of lines
# to FUNC many times in parallel, until all lines have been processed.
# Input lines will be appended as additional arguments to FUNC calls.
#
# FUNC and any global vars it references must be exported before
# calling process_lines().
#
# MESSAGE will be printed to STDERR before starting
#
process_lines() {

    local message="$1" ; shift
    local input_file="$1" ; shift

    # how many input lines? bail out if 0
    local line_count
    line_count="$(cat "$input_file" | wc -l)" || exit 1
    [[ "$line_count" -gt 0 ]] || return 0

    # How many lines to process at a time. The more the better, but with too
    # many some child jobs may starve -- cap it at 256
    local lines_per_job
    if [[ "$JOBS" -gt 1 ]] ; then
        let lines_per_job="line_count / JOBS / 2"
        if [[ "$lines_per_job" -eq 0 ]] ; then
            lines_per_job=1
        elif [[ "$lines_per_job" -gt 256 ]] ; then
            lines_per_job=256
        fi
    else
        lines_per_job=256
    fi

    echo "** $message [JOBS=$JOBS lines_per_job=$lines_per_job]" >&2

    # Prefer GNU parallel because it can exit early
    local -a cmd
    if [[ $GNU_PARALLEL_EXISTS -eq 1 ]] ; then
        cmd=(parallel --halt now,fail=1 -q -r -d '\n' -n $lines_per_job -P $JOBS "$@")
    else
        cmd=(xargs -r -d '\n' -n $lines_per_job -P $JOBS $SHELL -c '"$@"' unused_arg "$@")
    fi
    if ! "${cmd[@]}" <"$input_file" ; then
        echo "ERROR: command failed (\"$message\")" >&2
        return 1
    fi
}

#
# create directories in sort order, ie create parents before
# children
#
echo $'\n## Creating directories: '"$DST_DIR" >&2
while read -r line ; do
    [[ -n "$line" ]] || continue
    name="${line#* name=}"
    [[ -n "$name" ]] || continue
    attr_line="${line% name=*}"
    mode="$(echo "$attr_line" | sed -n -r 's#.*mode=([0-9]+).*#\1#p')"
    install_args=()
    if [[ "$CHANGE_OWNER" ]] ; then
        install_args+=("--owner" "$CHANGE_OWNER")
    elif [[ $EUID -eq 0 ]] ; then
        owner="$(echo "$attr_line" | sed -n -r 's#.*owner=([0-9]+).*#\1#p')"
        install_args+=("--owner" "$owner")
    fi
    if [[ "$CHANGE_GROUP" ]] ; then
        install_args+=("--group" "$CHANGE_GROUP")
    elif [[ $EUID -eq 0 ]] ; then
        group="$(echo "$attr_line" | sed -n -r 's#.*group=([0-9]+).*#\1#p')"
        install_args+=("--group" "$group")
    fi
    echo "  MKDIR $name" >&2
    if [[ -e "$DST_DIR/$name" && ! -d "$DST_DIR/$name" ]] ; then
        \rm "$DST_DIR/$name" || exit 1
    fi
    install -d "${install_args[@]}" "$DST_DIR/$name"
done <"$dir_list_file" || exit 1

#
# Copy or hardlink regular files
#
echo $'\n## Copying regular files: '"$SRC_DIR" >&2

# helper function to process regular files
# global vars used:
#   SRC_DIR
#   DST_DIR
#   CHANGE_OWNER
#   CHANGE_GROUP
#   EUID (always defined by bash)
#   EGID
#   TMP_DIR
#   XTRACE
#   combined_checksums_file
process_regfiles() {
    if [[ $XTRACE -eq 1 ]] ; then
        set -x
    fi

    # Temp file generated by this function. Its name must be unique to
    # prevent interference from other jobs with -j N.
    local matching_checksums_file
    matching_checksums_file="$TMP_DIR/matching_checksums-$$.list"

    local line attr_line
    for line in "$@" ; do

        # source file name relative to SRC_DIR
        local name
        name="${line#* name=}"
        [[ "$name" ]] || continue

        # all attributes leading up to name=
        attr_line="${line% name=*}"

        # source checksum
        local checksum
        #flock -s "$DST_DIR" echo "  SHA256 $name" >&2
        checksum="$(sha256sum "$SRC_DIR/$name" | awk '{print $1}')"
        if [[ ! "$checksum" ]] ; then
            flock -s "$DST_DIR" echo "$SRC_DIR/$name: failed to calculate checksum" >&2
            return 1
        fi

        # source owner; or a user-provided override
        local -a install_args=()
        local owner
        if [[ "$CHANGE_OWNER" ]] ; then
            owner="$CHANGE_OWNER"
            install_args+=("--owner" "$owner")
        elif [[ $EUID -eq 0 ]] ; then
            owner="$(echo "$attr_line" | sed -n -r 's#.* owner=([0-9]+).*#\1#p')"
            install_args+=("--owner" "$owner")
        else
            owner=$EUID
        fi

        # source group; or a user-provided override
        local group
        if [[ "$CHANGE_GROUP" ]] ; then
            group="$CHANGE_GROUP"
            install_args+=("--group" "$group")
        elif [[ $EGID -eq 0 ]] ; then
            group="$(echo "$attr_line" | sed -n -r 's#.* group=([0-9]+).*#\1#p')"
            install_args+=("--group" "$group")
        else
            group=$EGID
        fi

        # source file's mode/permissions
        local mode
        mode="$(echo "$attr_line" | sed -n -r 's#.* mode=([^[:space:]]+).*#\1#p')"
        # Search for the checksum in an older StxChecksums file
        if [[ "$combined_checksums_file" ]] ; then
            if look "$checksum " "$combined_checksums_file" >"$matching_checksums_file" 2>/dev/null ; then
                (
                    # As we read previously-archived files' properties from StxChecksums,
                    # make sure they have not changed compared to the actual files on disk.
                    while read -r ref_checksum ref_name ref_size ref_mtime ref_dev ref_inode ref_path x_rest ; do
                        [[ -f "$ref_path" ]] || continue
                        # read on-disk file properties
                        local ref_stat
                        ref_stat=($(stat -c '%s %Y %u %g %#04a' "$ref_path" || true))
                        [[ "${#ref_stat[@]}" -eq 5 ]] || continue

                        # on-disk size does not match StxChecksums
                        local ref_ondisk_size
                        ref_ondisk_size="${ref_stat[0]}"
                        [[ "$ref_size" == "$ref_ondisk_size" ]] || continue

                        # on-disk mtime does not match StxChecksums
                        local ref_ondisk_mtime
                        ref_ondisk_mtime="${ref_stat[1]}"
                        [[ "${ref_mtime}" == "$ref_ondisk_mtime" ]] || continue

                        # on-disk owner does not match requested owner
                        local ref_ondisk_owner
                        ref_ondisk_owner="${ref_stat[2]}"
                        [[ "${owner}" == "$ref_ondisk_owner" ]] || continue

                        # on-disk group does not match requested group
                        local ref_ondisk_group
                        ref_ondisk_group="${ref_stat[3]}"
                        [[ "${group}" == "$ref_ondisk_group" ]] || continue

                        # on-disk mode does not match the mode of the source file
                        ref_ondisk_mode="${ref_stat[4]}"
                        [[ "${mode}" == "$ref_ondisk_mode" ]] || continue

                        # At this point checksum, size, mtime, mode, owner, group and checksums of the
                        # existing file match with the file we are trying to copy.
                        # Use that file to create a hardlink.
                        flock -s "$DST_DIR" echo "  LINK $name (from $ref_name)" >&2
                        if ln -f "$ref_name" "${DST_DIR}/$name" ; then
                            flock -s "$DST_DIR" echo "$checksum $name $ref_size $ref_mtime $ref_dev $ref_inode $DST_DIR/$name"
                            exit 0
                        fi
                    done <"$matching_checksums_file"
                    # checksum not found in older archives
                    exit 1
                ) && continue || true
            fi
        fi

        # No matching files found: really copy it

        if [[ -e "$DST_DIR/$name" ]] ; then
            \rm "$DST_DIR/$name" || exit 1
        fi

        # source file's size & mtime
        local size mtime
        size="$(echo "$attr_line" | sed -n -r 's#.* size=([^[:space:]]+).*#\1#p')"
        mtime="$(echo "$attr_line" | sed -n -r 's#.* mtime=([^[:space:]]+).*#\1#p')"

        # copy it to $DST_DIR
        flock -s "$DST_DIR" echo "  COPY $name" >&2
        rm -f "$DST_DIR/$name" || exit 1
        install --preserve-timestamps "${install_args[@]}" --mode="$mode" -T "$SRC_DIR/$name" "$DST_DIR/$name" || exit 1

        # check destination file properties
        local dst_stat dst_size dst_dev dst_ino
        dst_stat=($(stat -c '%s %d %i' "$DST_DIR/$name")) || exit 1
        dst_size="${dst_stat[0]}"
        dst_dev="${dst_stat[1]}"
        dst_ino="${dst_stat[2]}"

        # file changed while copying
        if [[ "$dst_size" != "$size" ]] ; then
            flock -s "$DST_DIR" echo "ERROR: $SRC_DIR/$name changed while copying!" >&2
            exit 1
        fi

        # print out a line for StxChecksums using source file properties (preserved
        # during copying), but with destination file's dev & ino.
        flock -s "$DST_DIR" echo "$checksum $name $size $mtime $dst_dev $dst_ino $DST_DIR/$name"
    done

    rm -f "$matching_checksums_file"
}

# process files in parallel
(
    if [[ "$DST_CHECKSUMS_FILE" ]] ; then
        dst_checksums_fd=5
        exec 5<>"$DST_CHECKSUMS_FILE" || exit 1
    else
        dst_checksums_fd=1
    fi

    export SRC_DIR \
           DST_DIR \
           CHANGE_OWNER \
           CHANGE_GROUP \
           EGID \
           TMP_DIR \
           XTRACE \
           combined_checksums_file

    export -f process_regfiles

    message="processing regular files"
    process_lines "$message" "$regfile_list_file" process_regfiles | sort >&$dst_checksums_fd
    [[ "${PIPESTATUS[0]}" -eq 0 && "${PIPESTATUS[1]}" -eq 0 ]] || exit 1
) || exit 1

#
# copy special files
#
echo $'\n## Copying special files: '"$DST_DIR" >&2

# helper function for processing special files
# global vars used:
#   SRC_DIR
#   DST_DIR
#   CHANGE_OWNER
#   CHANGE_GROUP
#   XTRACE
process_other() {
    if [[ $XTRACE -eq 1 ]] ; then
        set -x
    fi
    local line attr_line
    for line in "$@" ; do
        local name
        name="${line#* name=}"
        [[ -n "$name" ]] || continue
        attr_line="${line% name=*}"

        local type
        type="$(echo "$attr_line" | sed 's#^type=\(.\) .*#\1#g')"
        [[ -n "$type" ]] || continue

        flock -s "$DST_DIR" echo "  CREATE type=$type $name" >&2
        if [[ -e "$DST_DIR/$name" ]] ; then
            rm "$DST_DIR/$name" || exit 1
        fi
        cp -a --no-dereference "$SRC_DIR/$name" "$DST_DIR/$name" || exit 1
        if [[ "$CHANGE_OWNER" || "$CHANGE_GROUP" ]] ; then
            local chown_arg=
            if [[ "$CHANGE_OWNER" ]] ; then
                chown_arg="$CHANGE_OWNER"
            fi
            if [[ "$CHANGE_GROUP" ]] ; then
                chown_arg+=":$CHANGE_GROUP"
            fi
            chown --no-dereference "$chown_arg" "$DST_DIR/$name" || exit 1
        fi
    done
}

# process them in parallel
(
    export SRC_DIR \
           DST_DIR \
           CHANGE_OWNER \
           CHANGE_GROUP \
           XTRACE

    export -f process_other

    message="processing other files"
    process_lines "$message" "$other_list_file" process_other || exit 1
) || exit 1

@@ -35,7 +35,7 @@ export REPO_TRACE=0
 
 # docker images
 SAFE_RSYNC_DOCKER_IMG="servercontainers/rsync:3.1.3"
-COREUTILS_DOCKER_IMG="starlingx/jenkins-pipelines-coreutils:20230529"
+COREUTILS_DOCKER_IMG="starlingx/jenkins-pipelines-coreutils:20250709"
 
 notice() {
     ( set +x ; print_log -i --notice "$@" ; )