archive-dir.sh: rewrite in Python
Old script was slow and didn't preserve hardlinks within the source set.

This script doesn't link files that are identical within the source set,
ie same checksum & attributes, but different inode. It can only link such
files to similar files from older builds. This deficiency will be
addressed in a separate commit.

TESTS
===================
* Manually test various input directories, including:
  - a directory that contains each type of file (regular, devices,
    sockets, symlinks, etc)
  - old index files with spaces in file names
* Given a build with a dozen or so historical builds, copied the "aptly"
  directory and compared timing and destination directory size
  before/after this patch:
  - old script: time=4m13s size=56.0G
  - new script: time=14s  size=6.1G
* Run a Jenkins build that rebuilds one package, and doesn't
  clean/rebuild the ISO. Make sure "archive-misc" works as expected.

Change-Id: Ic8f8931c4143bc355db1ccbad56ed772c0f3081e
Signed-off-by: Davlet Panech <davlet.panech@windriver.com>
@@ -1,8 +1,8 @@
-FROM debian:11
+FROM debian:12
 
 RUN apt-get update -y && \
     apt-get upgrade -y && \
-    apt-get install -y bsdextrautils parallel && \
+    apt-get install -y bsdextrautils parallel python3 && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
@@ -109,13 +109,14 @@ do_archive_dir() {
     tmp_dir="$BUILD_HOME/tmp/archive-misc"
     rm -rf "$tmp_dir/$id"
     mkdir -p "$tmp_dir/$id"
-    cp -a "$THIS_DIR/helpers/archive-dir.sh" "$tmp_dir/"
+    cp -a "$THIS_DIR/helpers/archive-dir.py" "$tmp_dir/"
     local archive_args=()
     if [[ "$spec_method" == "checksum-hardlink" ]] ; then
         local old_checksums_file_list="$tmp_dir/$id/old_checksums_file.list"
         local find_func=find_old_checksum_files__$id
         $find_func >"$old_checksums_file_list"
-        archive_args+=("--checksum-hardlink" "$old_checksums_file_list")
+        archive_args+=("--checksum-hardlink")
+        archive_args+=("--old-index-files-from=$old_checksums_file_list")
         local extra_checksums_file
         for extra_checksums_file in "$@" ; do
             print_regfile_name_if_exists "$extra_checksums_file"
@@ -132,10 +133,10 @@ do_archive_dir() {
     local src_dir="$BUILD_HOME/$dir"
     local dst_dir="$BUILD_OUTPUT_HOME/$dir"
     maybe_run mkdir -p "$dst_dir"
-    safe_docker_run $DRY_RUN_ARG --writeable-archive-root --rm "$COREUTILS_DOCKER_IMG" "$tmp_dir/archive-dir.sh" \
+    safe_docker_run $DRY_RUN_ARG --writeable-archive-root --rm "$COREUTILS_DOCKER_IMG" "$tmp_dir/archive-dir.py" \
         "${archive_args[@]}" \
         -j ${PARALLEL_CMD_JOBS:-1} \
-        --output-checksums "$BUILD_OUTPUT_HOME/$dir/$CHECKSUMS_FILENAME" \
+        --output-checksums="$BUILD_OUTPUT_HOME/$dir/$CHECKSUMS_FILENAME" \
         "$src_dir" \
         "$dst_dir" \
         "$tmp_dir/$id"
scripts/helpers/archive-dir.py  (new executable file, 826 lines)
@@ -0,0 +1,826 @@
#!/usr/bin/env python3

import sys
assert sys.version_info >= (3, 9), "Python >= 3.9 is required"

HELP="""\
Usage: archive-dir.py [<OPTIONS>...] <SRC_DIR> <DST_DIR> <TMP_DIR>
                      [<OLD_INDEX_FILES>...]

Archive SRC_DIR in DST_DIR, using TMP_DIR for temporary files.

Create the index file, DST_DIR/StxChecksums.

With --checksum-hardlink, attempt to link identical files from older builds
instead of copying them.


  -v,--verbose      be verbose

  -j,--jobs=N       perform various operations in parallel (default: 1)

  --owner=OWNER     set destination files' owner; requires root
                    privileges.

  --group=GROUP     set destination files' group as specified; requires root
                    privileges, or current user must be a member of GROUP

  --checksum-hardlink
                    Hardlink destination files if possible. You must provide
                    one or more index files (StxChecksums) generated by older
                    builds. We will use the files with matching properties &
                    checksums to create hard links in DST_DIR.

  --old-index-files-from=OLD_INDEX_LIST_FILE
                    Read additional index file names from OLD_INDEX_LIST_FILE

  --output-checksums=CK_FILE
                    save the index (StxChecksums) to this file, instead of
                    DST_DIR/StxChecksums

  --reflink         Create light-weight (COW) file copies if possible. This
                    only applies when copying (ie when no link candidates
                    found)

  --skip-existing   Skip files that already exist at destination. We still need
                    to calculate their checksums in order to create the index,
                    but we will skip the copy.

  --keep-temp-files
                    Normally we delete temporary files upon successful
                    completion; this option will keep them.

"""

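# Example invocation (paths and job count are illustrative only):
#
#   archive-dir.py -j 4 --checksum-hardlink \
#       --old-index-files-from=/tmp/old_checksums_file.list \
#       /build/aptly /archive/aptly /tmp/archive-misc/aptly
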
# FIXME: this doesn't link files that are identical within the source set,
# ie same checksum & attributes, but different inode. It can only link
# such files to similar files from older builds.

import argparse
from collections.abc import Iterable
from collections.abc import Callable
from dataclasses import dataclass
import grp
import hashlib
import itertools
from multiprocessing import Pool
import os
from pathlib import Path
import pwd
import re
import shutil
import shlex
import stat
import subprocess
from typing import TextIO

JOBS = 1
CHECKSUM_READ_SIZE = 4 * 1024 * 1024  # 4 MiB
COPY_REFLINK = False
OLD_INDEX_FILES = []
SKIP_EXISTING = False
SRC_DIR = None
DST_DIR = None
TMP_DIR = None
CHANGE_UID = None
CHANGE_GID = None
VERBOSITY = 0
CURRENT_GID_LIST = []
OUTPUT_INDEX_FILE = None
KEEP_TEMP_FILES = False

def log_error(msg:str)->None:
    print('ERROR: %s' % msg, file=sys.stderr)

def log_warn(msg:str)->None:
    print('WARNING: %s' % msg, file=sys.stderr)

def log_info(msg:str)->None:
    print('%s' % msg, file=sys.stderr)

def log_debug(msg:str)->None:
    if VERBOSITY > 0:
        print('%s' % msg, file=sys.stderr)

def log_shell_cmd(cmd:str)->None:
    if VERBOSITY > 0:
        print('%% %s' % cmd, file=sys.stderr)

# Apply func to items returned by an iterator in parallel.
# Returns an iterator with the results of func, in unpredictable
# order.
def map_p(func:Callable, it:Iterable)->Iterable:
    pool = Pool(JOBS)
    try:
        for x in pool.imap_unordered(func, it):
            yield x
        pool.close()
        pool.join()
    except:
        pool.terminate()
        pool.join()
        raise

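# Usage sketch (hypothetical paths; get_sha256 is defined below). Results
# arrive in completion order, not input order:
#
#   for digest in map_p(get_sha256, ['/tmp/a.bin', '/tmp/b.bin']):
#       print(digest)
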
# Remove a file if it exists. Raise an exception on directories.
def remove_file(filename:str)->None:
    try:
        os.unlink(filename)
    except FileNotFoundError:
        pass

# Sort a file, ie replace it with a sorted version
def sort_file_inplace(filename:str, tmp_filename:str)->None:
    cmd = [ 'sort', '--parallel=%d' % JOBS, '-o', tmp_filename, filename ]
    log_shell_cmd(shlex.join(cmd))
    subprocess.run(cmd, check=True)
    log_debug('rename(%s,%s)' % (tmp_filename, filename))
    os.unlink(filename)
    os.rename(tmp_filename, filename)

# Combine old index files into one and sort it by checksum
# Output saved to TMP_DIR/old_index.list
def combine_old_index_files():
    if OLD_INDEX_FILES:
        log_info('Combining old index files into one')
        # Use 'awk' to add StxChecksums' base directory to each relative filename in it,
        # for each input file, otherwise we won't be able to find the referenced file
        # later when we read these entries.
        #
        # Pipe awk's output to sort
        #
        # ie: ( awk [...] StxChecksums_1 ; awk [...] StxChecksums_2 ; ... ) | sort [...]

        # Start the sort process, reading from STDIN
        combined_index_file = os.path.join(TMP_DIR, 'old_index.list')
        sort_cmd = [ 'sort', '--parallel=%s' % JOBS, '--output=%s' % combined_index_file ]
        log_shell_cmd(shlex.join(sort_cmd))
        sort_proc = subprocess.Popen(sort_cmd, stdin=subprocess.PIPE)

        # For each input file, execute AWK with its STDOUT set to sort's STDIN
        try:
            dst_dir_realpath = os.path.realpath(DST_DIR)
            awk_expr = '{ if (match($0, /^[[:space:]]*[^[:space:]]+[[:space:]]+/) >= 0) print substr($0, 1, RLENGTH) DIR substr($0, RLENGTH+1) }'
            for old_index_file in OLD_INDEX_FILES:
                try:
                    # Skip the StxChecksums file that we are (re-)generating now
                    base_dir = os.path.realpath(os.path.dirname(old_index_file))
                    if base_dir == dst_dir_realpath and os.path.basename(old_index_file) == 'StxChecksums':
                        log_warn('Ignoring output index file %s' % old_index_file)
                        continue

                    # Input file may get deleted by job cleanup scripts from underneath us.
                    # Open the file for reading and pass the open file descriptor to AWK
                    with open(old_index_file) as old_index_fh:
                        os.set_inheritable(old_index_fh.fileno(), True)
                        log_debug('fd %d = %s' % (old_index_fh.fileno(), old_index_file))
                        awk_cmd = [ 'awk', '-v', 'DIR=%s/' % base_dir, awk_expr, '/dev/fd/%d' % old_index_fh.fileno() ]
                        log_shell_cmd(shlex.join(awk_cmd))
                        subprocess.run(awk_cmd, stdout=sort_proc.stdin, check=True, close_fds=False)
                except OSError as e:
                    # Ignore errors (typically ENOENT) -- fall back to copy elsewhere
                    log_warn('Failed to process %s: %s' % (old_index_file, str(e)))
                    continue
        finally:
            sort_proc.stdin.close()
            sort_proc.wait()
            if sort_proc.returncode != 0:
                raise subprocess.CalledProcessError(returncode=sort_proc.returncode, cmd=sort_cmd)

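# For example (illustrative values), the entry
#   "3b0c44... foo.deb 100 1688000000 ..."
# read from /old/build/StxChecksums is rewritten by awk to
#   "3b0c44... /old/build/foo.deb 100 1688000000 ..."
# so that find_old_files() can later stat the referenced file by its
# absolute path.
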
# Format a line of StxChecksums file
def format_index_line(rel_path:str, orig_path:str, checksum:str, st:os.stat_result)->str:
    return '%s %s %d %d %d %d %s' % (checksum, rel_path, st.st_size, st.st_mtime, st.st_dev, st.st_ino, orig_path)

# File information for intermediate file lists
@dataclass
class FileInfo:
    dev:int
    ino:int
    uid:int
    gid:int
    mode:int
    size:int
    mtime:float
    checksum:str
    rel_path:str

# Create a FileInfo object from a stat record
def stat_to_file_info(st:os.stat_result, checksum:str, rel_path:str)->FileInfo:
    return FileInfo(st.st_dev, st.st_ino, st.st_uid, st.st_gid, st.st_mode, st.st_size, st.st_mtime, checksum, rel_path)

# Format a FileInfo record as a line of text
#   DEV INO UID GID MODE SIZE MTIME CHECKSUM REL_PATH
def format_file_info(fi:FileInfo)->str:
    return '%d %d %d %d %d %d %f %s %s' % (fi.dev, fi.ino, fi.uid, fi.gid, fi.mode, fi.size, fi.mtime, fi.checksum, fi.rel_path)

# Parse a line of text into a FileInfo object
#   DEV INO UID GID MODE SIZE MTIME CHECKSUM REL_PATH
RE_FILE_INFO = re.compile(r'^(\d+) (\d+) (\d+) (\d+) (\d+) (\d+) (\S+) (-|[0-9a-f]+) (.+)$', re.ASCII)
def parse_file_info(line:str)->FileInfo:
    match = RE_FILE_INFO.match(line)
    if match:
        return FileInfo(
            int(match.group(1)),    # dev
            int(match.group(2)),    # ino
            int(match.group(3)),    # uid
            int(match.group(4)),    # gid
            int(match.group(5)),    # mode
            int(match.group(6)),    # size
            float(match.group(7)),  # mtime
            match.group(8),         # checksum
            match.group(9),         # rel_path
        )
    return None

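# Example round-trip (illustrative values): format_file_info() renders a
# FileInfo as
#   "64769 131072 1000 1000 33188 4096 1688000000.000000 - src/a.bin"
# and parse_file_info() turns that line back into FileInfo(dev=64769,
# ino=131072, uid=1000, gid=1000, mode=33188, size=4096,
# mtime=1688000000.0, checksum='-', rel_path='src/a.bin');
# mode 33188 == 0o100644, ie a regular file with rw-r--r-- permissions.
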
# Read a list of FileInfo objects from a file
def read_file_info_lines(filename:str)->Iterable[FileInfo]:
    with open(filename) as fh:
        for line in fh:
            fi = parse_file_info(line.rstrip('\n'))
            if fi:
                yield fi

#
# Find a hardlink candidate among the index (StxChecksums) files
# generated by older builds.
# Returns an iterator of tuples (old_path, stat_result), or None.
#
RE_OLD_FILE_INFO_LIST = [
    # Faster, but won't match filenames with spaces in them
    re.compile(r'^([0-9a-f]+) (\S+) (\d+) (\d+) (\d+) (\d+) (.+)$', re.ASCII),
    # Slower (because of .+ in the middle)
    re.compile(r'^([0-9a-f]+) (.+) (\d+) (\d+) (\d+) (\d+) (.+)$', re.ASCII)
]
def find_old_files(checksum:str)->Iterable[tuple[str, os.stat_result]]:
    # If there are no index files => no combined index either
    if OLD_INDEX_FILES:
        cmd = [ 'look', '%s ' % checksum, os.path.join(TMP_DIR, 'old_index.list') ]
        log_shell_cmd(shlex.join(cmd))
        p = subprocess.Popen(cmd, stdout=subprocess.PIPE, encoding='utf8')
        try:
            for line in p.stdout:
                line = line.rstrip('\n')
                re_match_found = False
                for regex in RE_OLD_FILE_INFO_LIST:
                    match = regex.match(line)
                    if match:
                        re_match_found = True
                        full_path = match.group(2)
                        size = int(match.group(3))
                        mtime = int(match.group(4))
                        try:
                            st = os.stat(full_path, follow_symlinks=False)
                            # NOTE: index files store time stamps as integers (ie truncated)
                            if st.st_size == size and int(st.st_mtime) == mtime:
                                yield (full_path, st)
                            else:
                                log_debug('ignoring old index entry because its metadata doesn\'t match reality [%s] size=%d:%d mtime=%d:%d' % (line, size, st.st_size, mtime, int(st.st_mtime)))
                        except FileNotFoundError:
                            log_debug('ignoring old index entry because the referenced file doesn\'t exist: %s' % full_path)
                        except OSError as e:
                            log_warn('ignoring old index entry: %s: %s' % (full_path, str(e)))
                if not re_match_found:
                    log_warn('Failed to parse (old) index line [%s]' % line)
        finally:
            p.stdout.close()
            p.wait()

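# Note: 'look' performs a binary search, which is why old_index.list is
# kept sorted by combine_old_index_files(). Searching for '<checksum> '
# (with the trailing space) prints every line whose checksum field matches
# exactly -- one line per old file with that content.
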
#
# Search SRC_DIR and save the FileInfo entries to 3 files:
#   dirs.list  -- directories
#   files.list -- non-directories with unique dev/ino
#   links.list -- duplicate dev/inos
#
# All files will have the checksum field set to "-"; we will calculate
# the checksums separately for files.list.
#
# Returns a tuple with total counts.
#
def find_files()->tuple[int,int,int]:

    log_info("searching for files")

    dirs_file = os.path.join(TMP_DIR, 'dirs.list')
    dirs_fh = None
    dirs_count = 0

    files_file = os.path.join(TMP_DIR, 'files.list')
    files_fh = None
    files_count = 0

    links_file = os.path.join(TMP_DIR, 'links.list')
    links_fh = None
    links_count = 0

    try:
        log_debug('creating %s' % dirs_file)
        dirs_fh = open(dirs_file, 'w')

        log_debug('creating %s' % files_file)
        files_fh = open(files_file, 'w')

        log_debug('creating %s' % links_file)
        links_fh = open(links_file, 'w')

        dev_map = {}

        def walk_error(err:Exception)->None:
            raise err

        dirs_count = 0
        files_count = 0
        links_count = 0

        log_debug(' %s/' % '.')
        st = os.stat(SRC_DIR, follow_symlinks=False)
        print('%s' % format_file_info(stat_to_file_info(st, '-', '.')), file=dirs_fh)
        dirs_count += 1

        for (dirpath, dirnames, filenames) in os.walk(SRC_DIR, onerror=walk_error):
            rel_dirpath = dirpath[len(SRC_DIR)+1:]
            extra_files = []

            # directories
            for dirname in dirnames:
                full_path = os.path.join(dirpath, dirname)
                st = os.stat(full_path, follow_symlinks=False)
                # os.walk() returns directory symlinks as "directories" here.
                # Treat them as any other non-directory file below
                if stat.S_ISDIR(st.st_mode):
                    rel_path = os.path.join(rel_dirpath, dirname)
                    log_debug(' %s/' % rel_path)
                    print('%s' % format_file_info(stat_to_file_info(st, '-', rel_path)), file=dirs_fh)
                    dirs_count += 1
                else:
                    extra_files.append(dirname)

            # files
            for filename in itertools.chain.from_iterable([filenames, extra_files]):
                rel_path = os.path.join(rel_dirpath, filename)
                full_path = os.path.join(dirpath, filename)
                log_debug(' %s' % rel_path)
                st = os.stat(full_path, follow_symlinks=False)
                ino_map = dev_map.get(st.st_dev)
                if ino_map is None:
                    ino_map = {}
                    dev_map[st.st_dev] = ino_map
                if st.st_ino not in ino_map:
                    ino_map[st.st_ino] = None
                    fh = files_fh
                    files_count += 1
                else:
                    fh = links_fh
                    links_count += 1
                print('%s' % format_file_info(stat_to_file_info(st, '-', rel_path)), file=fh)

    finally:
        for fh in (links_fh, files_fh, dirs_fh):
            if fh is not None:
                fh.close()

    # Sort files.list because we need to look up duplicate devno/ino entries
    # there for creating links
    sort_file_inplace(files_file, '%s.tmp' % files_file)

    log_info('found dirs=%d files=%d links=%d' % (dirs_count, files_count, links_count))
    return (dirs_count, files_count, links_count)

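# Dedup sketch for the walk above: the first path seen for a given
# (st_dev, st_ino) pair goes to files.list and is copied normally; any
# later path with the same pair is a hardlink to it and goes to
# links.list, to be re-linked at the destination by copy_links().
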
# Get the SHA256 of a file
def get_sha256(path:str)->str:
    with open(path, "rb") as f:
        file_hash = hashlib.sha256()
        while chunk := f.read(CHECKSUM_READ_SIZE):
            file_hash.update(chunk)
        return file_hash.hexdigest()

# Calculate and add the checksum given a FileInfo and return
# the updated FileInfo. Make no changes for non-regfiles.
def add_one_checksum(fi:FileInfo)->FileInfo:
    if stat.S_ISREG(fi.mode):
        src_path = os.path.join(SRC_DIR, fi.rel_path)
        log_debug('sha256(%s)' % src_path)
        fi.checksum = get_sha256(src_path)
    return fi

#
# Add checksums and sort files.list
#
def calc_checksums(files_count:int)->None:
    log_info("calculating checksums, count=%d" % files_count)

    list_file = os.path.join(TMP_DIR, 'files.list')
    tmp_list_file = os.path.join(TMP_DIR, 'files.list.tmp')

    log_debug('creating sorted %s' % tmp_list_file)
    with open(tmp_list_file, 'w') as fh:
        fi_iter = read_file_info_lines(list_file)
        for fi in map_p(add_one_checksum, fi_iter):
            print(format_file_info(fi), file=fh)

    cmd = [ 'sort', '--parallel=%d' % JOBS, '-o', list_file, tmp_list_file ]
    log_shell_cmd(shlex.join(cmd))
    subprocess.run(cmd, check=True)
    os.unlink(tmp_list_file)

#
# Create directories at destination
#
def create_dirs(dirs_count:int)->None:
    log_info("creating directories, count=%d" % dirs_count)
    for fi in read_file_info_lines(os.path.join(TMP_DIR, 'dirs.list')):
        if fi.rel_path == '.':
            path = DST_DIR
        else:
            path = os.path.join(DST_DIR, fi.rel_path)

        dst_exists = False
        try:
            st = os.stat(path)
            if stat.S_ISDIR(st.st_mode):
                dst_exists = True
            else:
                remove_file(path)
        except FileNotFoundError:
            pass

        if not dst_exists:
            log_debug('mkdir(%s)' % path)
            os.mkdir(path)

        # If we are not root, set directory permissions to be
        # writable by owner, because we will be creating files
        # there. This will fail if destination directory is not
        # already owned by us (to be expected).
        if os.geteuid() != 0:
            log_debug('chmod(%s, 0%o)' % (path, 0o700))
            # Don't set follow_symlinks because this function
            # is never called for symlinks
            os.chmod(path, 0o700)

# Copy a file and its attributes, but change UID/GID as specified
def do_copy(src_path:str, dst_path:str, new_uid:int, new_gid:int)->os.stat_result:
    #log_debug("copy(%s, %s)" % (src_path, dst_path))
    cmd = [ 'cp', '-a' ]
    if COPY_REFLINK:
        cmd.append('--reflink')
    cmd.append('--no-dereference')
    cmd.append('--')
    cmd.append(src_path)
    cmd.append(dst_path)
    log_shell_cmd(shlex.join(cmd))
    subprocess.run(cmd, check=True)

    # Doesn't support reflinks, see https://github.com/python/cpython/issues/81338
    #shutil.copy2(src_path, dst_path, follow_symlinks=False)

    st = os.stat(dst_path, follow_symlinks=False)
    if new_gid != st.st_gid or new_uid != st.st_uid:
        log_debug('chown(%s, %d, %d)' % (dst_path, new_uid, new_gid))
        os.chown(dst_path, new_uid, new_gid)
        st = os.stat(dst_path, follow_symlinks=False)

    return st

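# Note on do_copy() above: with GNU cp, a bare '--reflink' means
# '--reflink=always', which fails outright on filesystems without
# copy-on-write support; '--reflink=auto' would instead fall back to a
# regular data copy.
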
#
# Copy or link a regfile:
#   If there's an older file with the same checksum, link it
#   Otherwise copy it
#   If linking fails, also copy it
#
# Return tuple(REL_PATH, FULL_PATH, CHECKSUM, DST_STAT_RESULT, LINKED, COPIED, SKIPPED)
#
def copy_one_file(fi:FileInfo)->tuple:
    dst_path = os.path.join(DST_DIR, fi.rel_path)
    src_path = os.path.join(SRC_DIR, fi.rel_path)

    # Work out target file's UID/GID
    if CHANGE_GID is not None:
        new_gid = CHANGE_GID
    else:
        new_gid = fi.gid
        if os.geteuid() != 0 and new_gid not in CURRENT_GID_LIST:
            new_gid = os.getegid()

    if CHANGE_UID is not None:
        new_uid = CHANGE_UID
    else:
        new_uid = fi.uid
        if os.geteuid() != 0:
            new_uid = os.geteuid()

    # Skip existing files
    if SKIP_EXISTING:
        try:
            st = os.stat(dst_path, follow_symlinks=False)
            if st.st_uid == new_uid and \
               st.st_gid == new_gid and \
               st.st_size == fi.size and \
               st.st_mtime == fi.mtime and \
               st.st_mode == fi.mode:
                log_debug('skipping existing %s' % dst_path)
                # (REL_PATH, FULL_PATH, CHECKSUM, STAT_RES, LINKED, COPIED, SKIPPED)
                return (fi.rel_path, dst_path, fi.checksum, st, 0, 0, 1)
        except FileNotFoundError:
            pass

    # Delete destination file if it exists
    remove_file(dst_path)

    # Regular file: try to link it to a file from an older build
    if stat.S_ISREG(fi.mode) and fi.checksum != '-':

        # Look up an identical file among the older builds
        for (old_path, old_st) in find_old_files(fi.checksum):
            try:
                log_debug('found link candidate by checksum: %s' % old_path)
                # Only link old files whose attributes match the source file
                # except mtime
                if old_st.st_uid == new_uid and \
                   old_st.st_gid == new_gid and \
                   old_st.st_size == fi.size and \
                   old_st.st_mode == fi.mode:
                    log_debug('link(%s,%s)' % (old_path, dst_path))
                    os.link(old_path, dst_path)
                    dst_stat = os.stat(dst_path, follow_symlinks=False)
                    # (REL_PATH, FULL_PATH, CHECKSUM, STAT_RES, LINKED, COPIED, SKIPPED)
                    return (fi.rel_path, old_path, fi.checksum, dst_stat, 1, 0, 0)
            except OSError as e:
                log_warn('link(%s,%s): %s' % (old_path, dst_path, str(e)))

    # Checksum not found, or link failed: copy
    dst_stat = do_copy(src_path, dst_path, new_uid, new_gid)
    # (REL_PATH, FULL_PATH, CHECKSUM, STAT_RES, LINKED, COPIED, SKIPPED)
    return (fi.rel_path, dst_path, fi.checksum, dst_stat, 0, 1, 0)

#
# Copy files to DST_DIR
#
# Returns tuple (total_linked, total_copied, total_skipped)
#
def copy_files(files_count:int)->tuple[int,int,int]:
    log_info("copying files, count=%d" % files_count)
    total_linked = 0
    total_copied = 0
    total_skipped = 0
    with open(os.path.join(TMP_DIR, 'files.index'), 'w') as fh:
        fi_iter = read_file_info_lines(os.path.join(TMP_DIR, 'files.list'))
        for (rel_path, full_path, checksum, st, linked, copied, skipped) in map_p(copy_one_file, fi_iter):
            total_linked += linked
            total_copied += copied
            total_skipped += skipped
            if stat.S_ISREG(st.st_mode):
                index_line = format_index_line(rel_path, full_path, checksum, st)
                print('%s' % index_line, file=fh)

    return (total_linked, total_copied, total_skipped)

#
# Re-create a hardlink at destination, ie create a file
# as a link to a previously copied file, because it was
# linked in SRC_DIR.
#
# Fall back to copy if link fails.
#
# Return tuple(REL_PATH, FULL_PATH, CHECKSUM, DST_STAT_RESULT, LINKED, COPIED, SKIPPED)
#
def copy_one_link(fi:FileInfo)->tuple:
    dst_path = os.path.join(DST_DIR, fi.rel_path)
    src_path = os.path.join(SRC_DIR, fi.rel_path)

    # Delete destination file if it exists
    remove_file(dst_path)

    # Try to link it to a file we previously installed in copy_files().
    # Find the previously-installed source file in files.list, by dev/ino
    try:
        cmd = [ 'look', '%d %d ' % (fi.dev, fi.ino), os.path.join(TMP_DIR, 'files.list') ]
        log_shell_cmd(shlex.join(cmd))
        cmd_res = subprocess.run(cmd, check=False, encoding='utf8', stdout=subprocess.PIPE).stdout
        old_fi = parse_file_info(cmd_res)
        if old_fi:
            orig_path = os.path.join(DST_DIR, old_fi.rel_path)
            log_debug('link(%s,%s)' % (orig_path, dst_path))
            os.link(orig_path, dst_path)
            st = os.stat(dst_path, follow_symlinks=False)
            # (REL_PATH, FULL_PATH, CHECKSUM, STAT_RES, LINKED, COPIED, SKIPPED)
            return (fi.rel_path, orig_path, old_fi.checksum, st, 1, 0, 0)
    except OSError as e:
        log_warn('failed to link %s: %s' % (dst_path, str(e)))

    # Fall back to copy
    return copy_one_file(fi)

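# Example (illustrative numbers): for a source hardlink with dev=64769
# ino=131072, the lookup
#   look '64769 131072 ' files.list
# works because format_file_info() puts dev and ino first and files.list
# is kept sorted, so it returns the already-copied entry whose rel_path
# we can hard-link to instead of copying the data again.
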
#
# Re-create or copy hardlinks at destination
#
# Returns tuple (total_linked, total_copied, total_skipped)
#
def copy_links(links_count:int)->tuple[int,int,int]:
    log_info("copying links, count=%d" % links_count)
    total_linked = 0
    total_copied = 0
    total_skipped = 0
    with open(os.path.join(TMP_DIR, 'links.index'), 'w') as fh:
        fi_iter = read_file_info_lines(os.path.join(TMP_DIR, 'links.list'))
        for (rel_path, full_path, checksum, st, linked, copied, skipped) in map_p(copy_one_link, fi_iter):
            total_linked += linked
            total_copied += copied
            total_skipped += skipped
            if stat.S_ISREG(st.st_mode):
                index_line = format_index_line(rel_path, full_path, checksum, st)
                print('%s' % index_line, file=fh)
    return (total_linked, total_copied, total_skipped)

#
# Set directory permissions & ownership to how they were at the source
#
def adjust_one_dir_perms(fi:FileInfo)->FileInfo:
    path = os.path.join(DST_DIR, fi.rel_path)
    perms = stat.S_IMODE(fi.mode)
    log_debug("chmod(%s, 0%o)" % (path, perms))
    # Don't set follow_symlinks because this function
    # is never called for symlinks
    os.chmod(path, perms)

    # At this point the target directory exists and is owned
    # by the current UID:GID due to create_dirs().
    st = os.stat(path, follow_symlinks=False)

    if CHANGE_GID is not None:
        new_gid = CHANGE_GID
    else:
        new_gid = fi.gid
        if os.geteuid() != 0 and new_gid not in CURRENT_GID_LIST:
            new_gid = os.getegid()

    if CHANGE_UID is not None:
        new_uid = CHANGE_UID
    else:
        new_uid = fi.uid
        if os.geteuid() != 0:
            new_uid = os.geteuid()

    if new_uid != st.st_uid or new_gid != st.st_gid:
        log_debug("chown(%s, %d, %d)" % (path, new_uid, new_gid))
        os.chown(path, new_uid, new_gid, follow_symlinks=False)

    # Set both access time and modification time to the modification time
    # of the source directory
    log_debug("utime(%s, (%f, %f))" % (path, fi.mtime, fi.mtime))
    os.utime(path, (fi.mtime, fi.mtime))

    return fi

#
# Adjust directory permissions & ownership at destination
#
def adjust_dir_perms(dirs_count:int)->None:
    log_info("adjusting directory permissions, count=%d" % dirs_count)
    fi_iter = read_file_info_lines(os.path.join(TMP_DIR, 'dirs.list'))
    for fi in map_p(adjust_one_dir_perms, fi_iter):
        pass

# Save or print "standard" index (StxChecksums) for regfiles and links
def save_index(files_count:int, links_count:int)->None:
    files_index_file = os.path.join(TMP_DIR, 'files.index')
    links_index_file = os.path.join(TMP_DIR, 'links.index')
    # Honor --output-checksums when given; default to DST_DIR/StxChecksums
    if OUTPUT_INDEX_FILE:
        full_index_file = OUTPUT_INDEX_FILE
    else:
        full_index_file = os.path.join(DST_DIR, 'StxChecksums')
    log_info('creating index, count=%d' % (files_count + links_count))

    sort_cmd = [ 'sort', '--parallel=%d' % JOBS, '--output=%s' % full_index_file, files_index_file, links_index_file ]

    log_shell_cmd(shlex.join(sort_cmd))
    subprocess.run(sort_cmd, check=True)

# Delete temp files
def cleanup():
    if not KEEP_TEMP_FILES:
        tmp_files = [
            'dirs.list',
            'files.index',
            'files.list',
            'links.index',
            'links.list',
            'old_index.list',
        ]
        for file in tmp_files:
            remove_file(os.path.join(TMP_DIR, file))

# process command line
def init()->None:
    def positive_integer(s:str)->int:
        v = int(s)
        if v < 1:
            raise ValueError()
        return v
    def user_id(s:str)->int:
        try:
            uid = int(s)
        except:
            try:
                uid = pwd.getpwnam(s).pw_uid
            except:
                raise ValueError()
        if uid < 0:
            raise ValueError
        return uid
    def group_id(s:str)->int:
        try:
            gid = int(s)
        except:
            try:
                gid = grp.getgrnam(s).gr_gid
            except:
                raise ValueError()
        if gid < 0:
            raise ValueError
        return gid

    p = argparse.ArgumentParser()
    p.add_argument('-j', '--jobs', type=positive_integer, default=1)
    p.add_argument('--owner', type=user_id)
    p.add_argument('--group', type=group_id)
    p.add_argument('--checksum-hardlink', action='store_true', default=False)
    p.add_argument('--old-index-files-from')
    p.add_argument('--output-checksums')
    p.add_argument('--skip-existing', action='store_true', default=False)
    p.add_argument('-v', '--verbose', action='count', default=0, dest='verbosity')
    p.add_argument('--reflink', action='store_true', default=False)
    p.add_argument('--keep-temp-files', action='store_true', default=False)
    p.add_argument('SRC_DIR')
    p.add_argument('DST_DIR')
    p.add_argument('TMP_DIR')
    p.add_argument('old_index_files', nargs='*')
    p.format_help = lambda: HELP
    args = p.parse_args()

    current_gid_list = [ os.getegid(), *os.getgroups() ]
    if args.owner is not None:
        if os.geteuid() != 0 and args.owner != os.geteuid():
            log_error('--owner can only be changed by root')
            sys.exit(1)
    if args.group is not None:
        if os.geteuid() != 0 and args.group not in current_gid_list:
            log_error('--group can only be changed by root; or it must be a group you are a member of')
            sys.exit(1)

    existing_old_index_files = []
    if args.checksum_hardlink:
        old_index_files = []
        old_index_files += args.old_index_files
        if args.old_index_files_from:
            with open(args.old_index_files_from) as fh:
                for filename in fh:
                    filename = filename.rstrip('\n')
                    old_index_files.append(filename)
        # Ignore missing/non-readable files because they may disappear
        # while this script is running
        for filename in old_index_files:
            try:
                with open(filename) as ref_fh:
                    existing_old_index_files.append(filename)
            except OSError as x:
                log_warn('Ignoring index file %s: %s' % (filename, str(x)))
    elif args.old_index_files:
        log_warn('old index files are meaningless without --checksum-hardlink')

    global JOBS, CHANGE_UID, CHANGE_GID, CURRENT_GID_LIST
    global VERBOSITY, COPY_REFLINK, SRC_DIR, DST_DIR, TMP_DIR
    global OLD_INDEX_FILES, OUTPUT_INDEX_FILE
    global KEEP_TEMP_FILES, SKIP_EXISTING
    JOBS = args.jobs
    CHANGE_UID = args.owner
    CHANGE_GID = args.group
    CURRENT_GID_LIST = current_gid_list
    VERBOSITY = args.verbosity
    COPY_REFLINK = args.reflink
    SRC_DIR = str(Path(args.SRC_DIR).absolute())
    DST_DIR = str(Path(args.DST_DIR).absolute())
    TMP_DIR = str(Path(args.TMP_DIR).absolute())
    OLD_INDEX_FILES = existing_old_index_files
    OUTPUT_INDEX_FILE = args.output_checksums
    SKIP_EXISTING = args.skip_existing
    KEEP_TEMP_FILES = args.keep_temp_files

init()

log_debug('SRC_DIR=%s' % SRC_DIR)
log_debug('DST_DIR=%s' % DST_DIR)
log_debug('TMP_DIR=%s' % TMP_DIR)
log_debug('JOBS=%d' % JOBS)
if CHANGE_UID is not None:
    log_debug('CHANGE_UID=%d' % CHANGE_UID)
if CHANGE_GID is not None:
    log_debug('CHANGE_GID=%d' % CHANGE_GID)
log_debug('OLD_INDEX_FILES=%s' % OLD_INDEX_FILES)
log_debug('KEEP_TEMP_FILES=%d' % KEEP_TEMP_FILES)

if not os.path.isdir(TMP_DIR):
    os.mkdir(TMP_DIR)
(dirs_count, files_count, links_count) = find_files()
calc_checksums(files_count)
create_dirs(dirs_count)
combine_old_index_files()  # DST_DIR must already exist
(linked1, copied1, skipped1) = copy_files(files_count)
(linked2, copied2, skipped2) = copy_links(links_count)
adjust_dir_perms(dirs_count)
save_index(files_count, links_count)
cleanup()
log_info('%s linked=%d copied=%d skipped=%d' % (DST_DIR, linked1+linked2, copied1+copied2, skipped1+skipped2))
scripts/helpers/archive-dir.sh  (deleted file, 516 lines)
@@ -1,516 +0,0 @@
#!/bin/bash

PROGNAME="${BASH_SOURCE[0]##*/}"
SRC_DIR=
DST_DIR=
CHECKSUM_FILES_LIST_FILE=
DST_CHECKSUMS_FILE=
CHANGE_OWNER=
CHANGE_GROUP=
JOBS=1
XTRACE=0

usage() {
    echo -n "\
Usage: $0 [OPTIONS...] SRC_DIR DST_DIR TMP_DIR

Archive SRC_DIR in DST_DIR, using TMP_DIR for temporary files.

  -j,--jobs=N    calculate checksums in parallel (default: 1)
  --owner=OWNER  set copied file's owner as specified
  --group=GROUP  set copied file's group as specified

  --output-checksums=CK_FILE
                 save StxChecksums to this file; by default print it to
                 STDOUT

  --checksum-hardlink=CK_LIST_FILE
                 Hardlink destination files if possible. CK_LIST_FILE
                 must contain a list of existing StxChecksums file names
                 from previously-archived directories, one per line.
                 We will use the files with matching properties & checksums
                 to create hard links in DST_DIR.

  --xtrace       Enable debug output

If executed by root, we will preserve owners/groups of the copied files,
unless they are overridden on the command line.

If this script is called by non-root, it will create all files with the
calling user's effective user & group ownership.

"
    exit 0
}

cmdline_error() {
    if [[ "$#" -gt 0 ]] ; then
        echo "ERROR:" "$@" >&2;
    fi
    echo "Type \`$0 --help' for more info" >&2
    exit 1
}

check_pipe_status() {
    local -a pipestatus=(${PIPESTATUS[*]})
    local -i i
    for ((i=0; i<${#pipestatus[*]}; ++i)) ; do
        [[ "${pipestatus[$i]}" -eq 0 ]] || return 1
    done
    return 0
}

# Process command line
temp=$(getopt -o h,j: --long help,jobs:,owner:,group:,output-checksums:,checksum-hardlink:,xtrace -n "$PROGNAME" -- "$@") || cmdline_error
eval set -- "$temp"
while [[ "$#" -gt 0 ]] ; do
    case "$1" in
        -h|--help)
            usage
            exit 0
            ;;
        -j|--jobs)
            JOBS="$2"
            if [[ ! "$JOBS" =~ ^[0-9]{1,2}$ || "$JOBS" -le 0 || "$JOBS" -ge 99 ]] ; then
                cmdline_error "$1 must be an integer [1..99]"
            fi
            shift 2
            ;;
        --owner)
            CHANGE_OWNER="$2"
            shift 2
            ;;
        --group)
            CHANGE_GROUP="$2"
            shift 2
            ;;
        --checksum-hardlink)
            CHECKSUM_FILES_LIST_FILE="$2"
            shift 2
            ;;
        --output-checksums)
            DST_CHECKSUMS_FILE="$2"
            shift 2
            ;;
        --xtrace)
            XTRACE=1
            shift
            ;;
        --)
            shift
            break
            ;;
        *)
            cmdline_error
            ;;
    esac
done
[[ "$#" -ge 3 ]] || cmdline_error "not enough arguments"
[[ "$#" -le 3 ]] || cmdline_error "too many arguments"
SRC_DIR="$1"
DST_DIR="$2"
TMP_DIR="$3"

if [[ ! "$EGID" ]] ; then
    EGID="$(id -g)" || exit 1
fi

if [[ $XTRACE -eq 1 ]] ; then
    set -x
fi

# Make sure BSD look is installed
if ! look --help >/dev/null ; then
    echo "This script requires \"look\" to be installed" >&2
    exit 1
fi

# Check for GNU parallel
if parallel --help >/dev/null 2>&1 ; then
    GNU_PARALLEL_EXISTS=1
else
    GNU_PARALLEL_EXISTS=0
fi

set -e

#
# Combine checksum list files into one
#
if [[ "$CHECKSUM_FILES_LIST_FILE" ]] ; then
    echo $'\n## Combining checksum lists into one' >&2
    combined_checksums_file="$TMP_DIR/combined_checksums.list"
    while read -r checksums_file ; do
        # skip empty lines and comments
        if echo "$checksums_file" | grep -E '^\s*(#.*)$' ; then
            continue
        fi
        # skip missing files
        [[ -f "$checksums_file" ]] || continue
        # add file path to the second token (file name)
        checksums_dir="$(dirname "$checksums_file")"
        awk -v "DIR=$checksums_dir/" '{ if (match($0, /^[[:space:]]*[^[:space:]]+[[:space:]]+/) >= 0) print substr($0, 1, RLENGTH) DIR substr($0, RLENGTH+1) }' \
            "$checksums_file"
    done <"$CHECKSUM_FILES_LIST_FILE" | sort >"$combined_checksums_file"
    check_pipe_status
fi

#
# Create source file lists
#

# Create a list file with each source file or dir + their stat properties
echo $'\n## Compiling file list: '"$SRC_DIR" >&2
full_list_file="$TMP_DIR/full.list"
( cd "$SRC_DIR" && find -printf 'type=%y owner=%U group=%G mode=%#m size=%s mtime=%T@ name=%p\n' ) \
    | sed 's#name=[.]/#name=#' \
    | sed 's#\(mtime=[0-9]\+\)[.][0-9]\+#\1#g' \
    >"${full_list_file}"
check_pipe_status

# Create another list file that contains only regular files
regfile_list_file="$TMP_DIR/regfile.list"
\grep '^type=f' "$full_list_file" | sort -k 7 >"$regfile_list_file" || exit 1

# Create a list file that contains only directories
# Sort by the last field "name=..."
dir_list_file="$TMP_DIR/dir.list"
\grep '^type=d' "$full_list_file" | sort -k 7 >"$dir_list_file" || exit 1

# Create a list file that contains all other entries (non-dirs & non-files)
other_list_file="$TMP_DIR/other.list"
\grep '^type=[^df]' "$full_list_file" | sort -k 7 >"$other_list_file" || exit 1

#
# Usage: process_lines MESSAGE INPUT_FILE FUNC ARGS...
#
# Call shell function FUNC in parallel, similar to xargs.
# We will read lines from INPUT_FILE, then pass some subset of lines
# to FUNC many times in parallel, until all lines have been processed.
# Input lines will be appended as additional arguments to FUNC calls.
#
# FUNC and any global vars it references must be exported before
# calling process_lines().
#
# MESSAGE will be printed to STDERR before starting
#
process_lines() {

    local message="$1" ; shift
    local input_file="$1" ; shift

    # how many input lines? bail out if 0
    local line_count
    line_count="$(cat "$input_file" | wc -l)" || exit 1
    [[ "$line_count" -gt 0 ]] || return 0

    # How many lines to process at a time. The more the better, but with too
    # many some child jobs may starve -- cap it at 256
    local lines_per_job
    if [[ "$JOBS" -gt 1 ]] ; then
        let lines_per_job="line_count / JOBS / 2"
        if [[ "$lines_per_job" -eq 0 ]] ; then
            lines_per_job=1
        elif [[ "$lines_per_job" -gt 256 ]] ; then
            lines_per_job=256
        fi
    else
        lines_per_job=256
    fi

    echo "** $message [JOBS=$JOBS lines_per_job=$lines_per_job]" >&2

    # Prefer GNU parallel because it can exit early
    local -a cmd
    if [[ $GNU_PARALLEL_EXISTS -eq 1 ]] ; then
        cmd=(parallel --halt now,fail=1 -q -r -d '\n' -n $lines_per_job -P $JOBS "$@")
    else
        cmd=(xargs -r -d '\n' -n $lines_per_job -P $JOBS $SHELL -c '"$@"' unused_arg "$@")
    fi
    if ! "${cmd[@]}" <"$input_file" ; then
        echo "ERROR: command failed (\"$message\")" >&2
        return 1
    fi
}

#
# create directories in sort order, ie create parents before
# children
#
echo $'\n## Creating directories: '"$DST_DIR" >&2
while read -r line ; do
    [[ -n "$line" ]] || continue
    name="${line#* name=}"
    [[ -n "$name" ]] || continue
    attr_line="${line% name=*}"
    mode="$(echo "$attr_line" | sed -n -r 's#.*mode=([0-9]+).*#\1#p')"
    install_args=()
    if [[ "$CHANGE_OWNER" ]] ; then
        install_args+=("--owner" "$CHANGE_OWNER")
    elif [[ $EUID -eq 0 ]] ; then
        owner="$(echo "$attr_line" | sed -n -r 's#.*owner=([0-9]+).*#\1#p')"
        install_args+=("--owner" "$owner")
    fi
    if [[ "$CHANGE_GROUP" ]] ; then
        install_args+=("--group" "$CHANGE_GROUP")
    elif [[ $EUID -eq 0 ]] ; then
        group="$(echo "$attr_line" | sed -n -r 's#.*group=([0-9]+).*#\1#p')"
        install_args+=("--group" "$group")
    fi
    echo "  MKDIR $name" >&2
    if [[ -e "$DST_DIR/$name" && ! -d "$DST_DIR/$name" ]] ; then
        \rm "$DST_DIR/$name" || exit 1
    fi
    install -d "${install_args[@]}" "$DST_DIR/$name"
done <"$dir_list_file" || exit 1

#
# Copy or hardlink regular files
#
echo $'\n## Copying regular files: '"$SRC_DIR" >&2

# helper function to process regular files
# global vars used:
#   SRC_DIR
#   DST_DIR
#   CHANGE_OWNER
#   CHANGE_GROUP
#   EUID (always defined by bash)
#   EGID
#   TMP_DIR
#   XTRACE
#   combined_checksums_file
process_regfiles() {
    if [[ $XTRACE -eq 1 ]] ; then
        set -x
    fi

    # Temp file generated by this function. Its name must be unique to
    # prevent interference from other jobs with -j N.
    local matching_checksums_file
    matching_checksums_file="$TMP_DIR/matching_checksums-$$.list"

    local line attr_line
    for line in "$@" ; do

        # source file name relative to SRC_DIR
        local name
        name="${line#* name=}"
        [[ "$name" ]] || continue

        # all attributes leading up to name=
        attr_line="${line% name=*}"

        # source checksum
        local checksum
        #flock -s "$DST_DIR" echo "  SHA256 $name" >&2
        checksum="$(sha256sum "$SRC_DIR/$name" | awk '{print $1}')"
        if [[ ! "$checksum" ]] ; then
            flock -s "$DST_DIR" echo "$SRC_DIR/$name: failed to calculate checksum" >&2
            return 1
        fi

        # source owner; or a user-provided override
        local -a install_args=()
        local owner
        if [[ "$CHANGE_OWNER" ]] ; then
            owner="$CHANGE_OWNER"
            install_args+=("--owner" "$owner")
        elif [[ $EUID -eq 0 ]] ; then
            owner="$(echo "$attr_line" | sed -n -r 's#.* owner=([0-9]+).*#\1#p')"
            install_args+=("--owner" "$owner")
        else
            owner=$EUID
        fi

        # source group; or a user-provided override
        local group
        if [[ "$CHANGE_GROUP" ]] ; then
            group="$CHANGE_GROUP"
            install_args+=("--group" "$group")
        elif [[ $EGID -eq 0 ]] ; then
            group="$(echo "$attr_line" | sed -n -r 's#.* group=([0-9]+).*#\1#p')"
            install_args+=("--group" "$group")
        else
            group=$EGID
        fi

        # source file's mode/permissions
        local mode
        mode="$(echo "$attr_line" | sed -n -r 's#.* mode=([^[:space:]]+).*#\1#p')"
        # Search for the checksum in an older StxChecksums file
        if [[ "$combined_checksums_file" ]] ; then
            if look "$checksum " "$combined_checksums_file" >"$matching_checksums_file" 2>/dev/null ; then
                (
                    # As we read previously-archived files' properties from StxChecksums,
                    # make sure they have not changed compared to the actual files on disk.
                    while read -r ref_checksum ref_name ref_size ref_mtime ref_dev ref_inode ref_path x_rest ; do
                        [[ -f "$ref_path" ]] || continue
                        # read on-disk file properties
                        local ref_stat
                        ref_stat=($(stat -c '%s %Y %u %g %#04a' "$ref_path" || true))
                        [[ "${#ref_stat[@]}" -eq 5 ]] || continue

                        # on-disk size does not match StxChecksums
                        local ref_ondisk_size
                        ref_ondisk_size="${ref_stat[0]}"
                        [[ "$ref_size" == "$ref_ondisk_size" ]] || continue

                        # on-disk mtime does not match StxChecksums
                        local ref_ondisk_mtime
                        ref_ondisk_mtime="${ref_stat[1]}"
                        [[ "${ref_mtime}" == "$ref_ondisk_mtime" ]] || continue

                        # on-disk owner does not match requested owner
                        local ref_ondisk_owner
                        ref_ondisk_owner="${ref_stat[2]}"
                        [[ "${owner}" == "$ref_ondisk_owner" ]] || continue

                        # on-disk group does not match requested group
                        local ref_ondisk_group
                        ref_ondisk_group="${ref_stat[3]}"
                        [[ "${group}" == "$ref_ondisk_group" ]] || continue

                        # on-disk mode does not match the mode of the source file
                        ref_ondisk_mode="${ref_stat[4]}"
                        [[ "${mode}" == "$ref_ondisk_mode" ]] || continue

                        # At this point checksum, size, mtime, mode, owner, group and checksums of the
                        # existing file match with the file we are trying to copy.
                        # Use that file to create a hardlink.
                        flock -s "$DST_DIR" echo "  LINK $name (from $ref_name)" >&2
                        if ln -f "$ref_name" "${DST_DIR}/$name" ; then
                            flock -s "$DST_DIR" echo "$checksum $name $ref_size $ref_mtime $ref_dev $ref_inode $DST_DIR/$name"
                            exit 0
                        fi
                    done <"$matching_checksums_file"
                    # checksum not found in older archives
                    exit 1
                ) && continue || true
            fi
        fi

        # No matching files found: really copy it

        if [[ -e "$DST_DIR/$name" ]] ; then
            \rm "$DST_DIR/$name" || exit 1
        fi

        # source file's size & mtime
        local size mtime
        size="$(echo "$attr_line" | sed -n -r 's#.* size=([^[:space:]]+).*#\1#p')"
        mtime="$(echo "$attr_line" | sed -n -r 's#.* mtime=([^[:space:]]+).*#\1#p')"

        # copy it to $DST_DIR
        flock -s "$DST_DIR" echo "  COPY $name" >&2
        rm -f "$DST_DIR/$name" || exit 1
        install --preserve-timestamps "${install_args[@]}" --mode="$mode" -T "$SRC_DIR/$name" "$DST_DIR/$name" || exit 1

        # check destination file properties
        local dst_stat dst_size dst_dev dst_ino
        dst_stat=($(stat -c '%s %d %i' "$DST_DIR/$name")) || exit 1
        dst_size="${dst_stat[0]}"
        dst_dev="${dst_stat[1]}"
        dst_ino="${dst_stat[2]}"

        # file changed while copying
        if [[ "$dst_size" != "$size" ]] ; then
            flock -s "$DST_DIR" echo "ERROR: $SRC_DIR/$name changed while copying!" >&2
            exit 1
        fi

        # print out a line for StxChecksums using source file properties (preserved
        # during copying), but with destination file's dev & ino.
        flock -s "$DST_DIR" echo "$checksum $name $size $mtime $dst_dev $dst_ino $DST_DIR/$name"
    done

    rm -f "$matching_checksums_file"
}

# process files in parallel
(
    if [[ "$DST_CHECKSUMS_FILE" ]] ; then
        dst_checksums_fd=5
        exec 5<>"$DST_CHECKSUMS_FILE" || exit 1
    else
        dst_checksums_fd=1
    fi

    export SRC_DIR \
           DST_DIR \
           CHANGE_OWNER \
           CHANGE_GROUP \
           EGID \
           TMP_DIR \
           XTRACE \
           combined_checksums_file

    export -f process_regfiles

    message="processing regular files"
    process_lines "$message" "$regfile_list_file" process_regfiles | sort >&$dst_checksums_fd
    [[ "${PIPESTATUS[0]}" -eq 0 && "${PIPESTATUS[1]}" -eq 0 ]] || exit 1
) || exit 1

#
# copy special files
#
echo $'\n## Copying special files: '"$DST_DIR" >&2

# helper function for processing special files
# global vars used:
#   SRC_DIR
#   DST_DIR
#   CHANGE_OWNER
#   CHANGE_GROUP
#   XTRACE
process_other() {
    if [[ $XTRACE -eq 1 ]] ; then
        set -x
    fi
    local line attr_line
    for line in "$@" ; do
        local name
        name="${line#* name=}"
        [[ -n "$name" ]] || continue
        attr_line="${line% name=*}"

        local type
        type="$(echo "$attr_line" | sed 's#^type=\(.\) .*#\1#g')"
        [[ -n "$type" ]] || continue

        flock -s "$DST_DIR" echo "  CREATE type=$type $name" >&2
        if [[ -e "$DST_DIR/$name" ]] ; then
            rm "$DST_DIR/$name" || exit 1
        fi
        cp -a --no-dereference "$SRC_DIR/$name" "$DST_DIR/$name" || exit 1
        if [[ "$CHANGE_OWNER" || "$CHANGE_GROUP" ]] ; then
            local chown_arg=
            if [[ "$CHANGE_OWNER" ]] ; then
                chown_arg="$CHANGE_OWNER"
            fi
            if [[ "$CHANGE_GROUP" ]] ; then
                chown_arg+=":$CHANGE_GROUP"
            fi
            chown --no-dereference "$chown_arg" "$DST_DIR/$name" || exit 1
        fi
    done
}

# process them in parallel
(
    export SRC_DIR \
           DST_DIR \
           CHANGE_OWNER \
           CHANGE_GROUP \
           XTRACE

    export -f process_other

    message="processing other files"
    process_lines "$message" "$other_list_file" process_other || exit 1
) || exit 1

@@ -35,7 +35,7 @@ export REPO_TRACE=0
 
 # docker images
 SAFE_RSYNC_DOCKER_IMG="servercontainers/rsync:3.1.3"
-COREUTILS_DOCKER_IMG="starlingx/jenkins-pipelines-coreutils:20230529"
+COREUTILS_DOCKER_IMG="starlingx/jenkins-pipelines-coreutils:20250709"
 
 notice() {
     ( set +x ; print_log -i --notice "$@" ; )