Coverage for enderchest/sync/rsync.py: 87%
138 statements
« prev ^ index » next coverage.py v7.5.0, created at 2024-05-04 01:41 +0000
« prev ^ index » next coverage.py v7.5.0, created at 2024-05-04 01:41 +0000
1"""rsync sync implementation. Relies on the user having rsync installed on their system"""
3import os.path
4import re
5import shutil
6import subprocess
7from collections import defaultdict
8from pathlib import Path
9from typing import Iterable
10from urllib.parse import ParseResult, unquote
12from . import SYNC_LOGGER, get_default_netloc, uri_to_ssh
# Resolve the rsync executable once, at import time. Without it, nothing in
# this module can work, so importing fails immediately with a clear message.
if (RSYNC := shutil.which("rsync")) is None:  # pragma: no cover
    raise RuntimeError(
        "No rsync executable found on your system. Cannot sync using this protocol."
    )
21def _get_rsync_version() -> tuple[int, int]:
22 """Determine the installed version of Rsync
24 Returns
25 -------
26 int
27 The major version of the resolved Rsync executable
28 int
29 The minor version of the resolved Rsync executable
31 Raises
32 -----
33 RuntimeError
34 If Rsync is not installed, if `rsync --version` returns an error or if
35 the version information cannot be decoded from the `rsync --version`
36 output
37 """
38 try:
39 result = subprocess.run(
40 ["rsync", "--version"],
41 stdout=subprocess.PIPE,
42 stderr=subprocess.PIPE,
43 check=False,
44 )
45 if result.stderr: # TODO: #124 just use check=True
46 raise RuntimeError(result.stderr.decode("utf-8"))
48 head = result.stdout.decode("utf-8").splitlines()[0]
49 except (FileNotFoundError, IndexError):
50 raise RuntimeError("Rsync is not installed or could not be executed.")
52 try:
53 if match := re.match(
54 r"^rsync[\s]+version ([0-9]+).([0-9]+).([0-9]+)",
55 head,
56 ):
57 major, minor, *_ = match.groups()
58 return int(major), int(minor)
59 raise AssertionError
60 except (AssertionError, ValueError):
61 raise RuntimeError(f"Could not parse version output:\n{head}")
# Check the installed rsync version at import time: this module refuses to load
# when the detected (major, minor) version is older than 3.2.
rsync_version = _get_rsync_version()
if rsync_version < (3, 2):
    raise RuntimeError(
        "EnderChest requires Rsync 3.2 or newer."
        f" The version detected on your system is {rsync_version[0]}.{rsync_version[1]}"
    )
def run_rsync(
    working_directory: Path,
    source: str,
    destination_folder: str,
    delete: bool,
    dry_run: bool,
    exclude: Iterable[str],
    *additional_args: str,
    timeout: int | None = None,
    verbosity: int = 0,
    rsync_flags: str | None = None,
) -> None:
    """Run an operation with rsync

    Parameters
    ----------
    working_directory : Path
        The working directory to run the sync command from
    source : str
        The source file or folder to sync, specified as either a URI string,
        an ssh address or a path relative to the working directory
    destination_folder : str
        The destination folder where the file or folder should be synced to,
        with the same formats available as for source
    delete : bool
        Whether part of the syncing should include deleting files at the
        destination that aren't at the source
    dry_run : bool
        Whether to only simulate this sync (report the operations to be
        performed but not actually perform them)
    exclude : list of str
        Any patterns that should be excluded from the sync
    *additional_args : str
        Any additional arguments to pass into the rsync command
    timeout : int, optional
        The number of seconds to wait before timing out the sync operation.
        If None (or 0) is provided, no explicit timeout value will be set.
    verbosity : int
        A modifier for how much info to output either to stdout or the
        INFO-level logs. At...

        - verbosity = -2 : No information will be printed, even on dry runs
        - verbosity = -1 : The sync itself will be silent. Dry runs will only
                           report the sync statistics.
        - verbosity =  0 : Actual syncs will display a progress bar. Dry run
                           reports will summarize the changes to each shulker
                           box in addition to reporting the sync statistics.
        - verbosity =  1 : Actual syncs will report the progress of each file
                           transfer. Dry runs will report on each file to
                           be created, updated or deleted.
        - verbosity =  2 : Dry runs and syncs will print or log the output
                           of rsync run using the `-vv` modifier

        Verbosity values outside of this range will simply be capped / floored
        to [-2, 2].
    rsync_flags : str, optional
        By default, rsync will be run using the flags "shaz" which means:

        - no space splitting
        - make output (file sizes, mostly) human-readable
        - archive mode (see: https://www.baeldung.com/linux/rsync-archive-mode)
        - compress data during transfer

        Advanced users may choose to override these options, but **you do so
        at your own peril**.

    Raises
    ------
    TimeoutError
        If the rsync operation times out before completion
    FileNotFoundError
        If rsync's error output contains "No such file or directory"
    RuntimeError
        If rsync writes anything else to stderr

    Notes
    -----
    This method does not perform any validation or normalization of the source,
    destination, exclude-list, additional arguments or rsync options.
    """
    rsync_flags = rsync_flags or "shaz"
    # Enforce the documented [-2, 2] range so that, e.g., verbosity=5 does not
    # turn into "-vvvvv".
    verbosity = max(-2, min(verbosity, 2))

    args: list[str] = [RSYNC, f"-{rsync_flags}"]  # type: ignore[list-item]
    if delete:
        args.append("--delete")
    if dry_run:
        args.extend(("--dry-run", "--stats"))
        if verbosity < 1:
            # at 1+ we don't need it to be machine-parseable
            args.append("--out-format=%i %n")
    else:
        if verbosity >= 0:
            args.append("--stats")
        if verbosity == 0:
            args.append("--info=progress2")
        if verbosity >= 1:
            args.append("--progress")
    if verbosity > 0:
        args.append("-" + "v" * verbosity)

    for pattern in exclude:
        args.extend(("--exclude", pattern))
    args.extend(additional_args)
    args.extend((source, destination_folder))

    SYNC_LOGGER.debug(
        "Executing the following command:\n %s",
        " ".join(args),
    )

    with subprocess.Popen(
        args,
        # Only dry-run output is captured for summarizing; a real sync writes
        # its progress straight to the parent's stdout.
        stdout=subprocess.PIPE if dry_run else None,
        stderr=subprocess.PIPE,
        cwd=working_directory,
    ) as proc:
        try:
            # communicate() drains both pipes while it waits. Using wait()
            # with PIPEs can deadlock once the child fills a pipe buffer,
            # which would then be misreported as a timeout.
            raw_stdout, raw_stderr = proc.communicate(timeout=timeout or None)
        except subprocess.TimeoutExpired as times_up:
            proc.kill()
            # Collect whatever the process managed to emit before being killed
            raw_stdout, raw_stderr = proc.communicate()
            if raw_stdout:
                if output_log := raw_stdout.decode("UTF-8"):
                    SYNC_LOGGER.warning(output_log)
            if raw_stderr:
                if error_log := raw_stderr.decode("UTF-8"):
                    SYNC_LOGGER.error(error_log)
            raise TimeoutError("Timeout reached.") from times_up

    # raw_stdout is None when stdout was not captured (i.e. not a dry run)
    if raw_stdout:
        if output_log := raw_stdout.decode("UTF-8"):
            if verbosity > 0:
                dry_run_output = output_log.splitlines()
            else:
                dry_run_output = summarize_rsync_report(output_log)

            SYNC_LOGGER.info("\nSUMMARY\n-------")
            for line in dry_run_output:
                if _is_important_stats_line(line):
                    # 25 sits between logging.INFO (20) and logging.WARNING (30)
                    SYNC_LOGGER.log(25, line)
                else:
                    SYNC_LOGGER.debug(line)

    if raw_stderr:
        if error_log := raw_stderr.decode("UTF-8"):
            if "No such file or directory" in error_log:
                raise FileNotFoundError(error_log)
            raise RuntimeError(error_log)  # pragma: no cover
def summarize_rsync_report(raw_output: str, depth: int = 2) -> list[str]:
    """Take the captured output from running
    `rsync -ha --out-format="%i %n"`
    and report a high-level summary to the logging.INFO level

    Parameters
    ----------
    raw_output : str
        The raw output captured from running the rsync command
    depth : int, optional
        How many directories to go down from the root to generate the summary.
        Default is 2 (just report on top-level files and folders within the
        source folder).

    Returns
    -------
    list of str
        Any lines that weren't part of the rsync report (and were probably
        part of `--stats`?)

    Notes
    -----
    The rsync man page (https://linux.die.net/man/1/rsync) describes the output
    format as... "cryptic," which I find rather charitable. The relevant bits
    are that `--out-format="%i %n"` produces:
      - `%i` : a string of 11 characters that gives various metadata about the
               file transfer operation (is it a file, a directory or a link?
               Is it being sent or received? Created, updated or deleted?)
      - `%n`: the path of the file (or whatever), unquoted, un-escaped

    Every recognized report line is also echoed to the DEBUG log.
    """
    # Each key maps either to per-operation counters for entries nested under
    # it, or to a single operation name when the key itself is the thing being
    # created / updated / deleted.
    summary: dict[str, dict[str, int] | str] = defaultdict(
        lambda: {"create": 0, "update": 0, "delete": 0}
    )
    stats: list[str] = []
    for line in raw_output.splitlines():
        # Split only once so that runs of whitespace *inside* a file name
        # are preserved.
        fields = line.split(maxsplit=1)
        if not fields:  # skip empty (and whitespace-only) lines
            continue

        info = fields[0]
        full_path = os.path.normpath(fields[1] if len(fields) > 1 else "")
        path_key = os.sep.join(full_path.split(os.sep)[:depth])

        operation: str | None
        if info.startswith("*deleting"):
            operation = "delete"
        elif info[2:5] == "+++":  # this is a creation
            operation = "create"
        elif info[:2] in ("<f", ">f", "cL"):
            # a file transfer (creates were caught above, so this must be an
            # update) or, as far as I can tell, the replacing of a link
            operation = "update"
        elif info[:1] == ".":  # pragma: no cover
            # this just means permissions or dates are being updated or something
            operation = None
        else:  # then hopefully this is part of the stats report
            stats.append(line)
            continue

        if operation is not None:
            if full_path == path_key:
                # the summarized entry itself is the target of the operation
                summary[path_key] = operation
            elif operation == "create" and info[1] == "d":
                # don't count directories among the created files
                pass
            else:
                entry = summary[path_key]
                if not isinstance(entry, str):
                    entry[operation] += 1
                # otherwise this is already described by the top-level op

        SYNC_LOGGER.debug(line)

    for path_key, report in sorted(summary.items()):
        if isinstance(report, str):
            # nice that these verbs follow the same pattern
            SYNC_LOGGER.info(f"{report[:-1].title()}ing {path_key}")
        else:
            # The path is passed as a logging argument rather than being
            # interpolated into the format string, so a "%" in a path cannot
            # corrupt the %-formatting.
            SYNC_LOGGER.info(
                "Within %s...\n%s",
                path_key,
                "\n".join(
                    f" - {op[:-1].title()}ing {count} file{'' if count == 1 else 's'}"
                    for op, count in report.items()
                ),
            )
    return stats
326def _is_important_stats_line(line: str) -> bool:
327 """Determine if a stats line is worth logging at the INFO level (or whether
328 it should be relegated to the DEBUG log)
330 Parameters
331 ----------
332 line : str
333 The log line to evaluate
335 Returns
336 -------
337 bool
338 True if and only if the line is identified as important
339 """
340 return line.startswith(
341 (
342 "Number of created files:",
343 "Number of deleted files:",
344 "Number of regular files transferred:",
345 "Total transferred file size:",
346 )
347 )
350def pull(
351 remote_uri: ParseResult,
352 local_path: Path,
353 exclude: Iterable[str],
354 dry_run: bool,
355 use_daemon: bool = False,
356 timeout: int | None = None,
357 delete: bool = True,
358 verbosity: int = 0,
359 rsync_args: Iterable[str] | None = None,
360) -> None:
361 """Sync an upstream file or folder into the specified location using rsync.
362 This will overwrite any files and folders already at the destination.
364 Parameters
365 ----------
366 remote_uri : ParseResult
367 The URI for the remote resource to copy from
368 local_path : Path
369 The destination folder
370 exclude : list of str
371 Any patterns that should be excluded from the sync
372 dry_run : bool
373 Whether to only simulate this sync (report the operations to be performed
374 but not actually perform them)
375 use_daemon : bool, optional
376 By default, the rsync is performed over ssh. If you happen to have an
377 rsync daemon running on your system, however, you're welcome to leverage
378 it instead by passing in `use_daemon=True`
379 timeout : int, optional
380 The number of seconds to wait before timing out the sync operation.
381 If None is provided, no explicit timeout value will be set.
382 delete : bool, optional
383 Whether part of the syncing should include deleting files at the destination
384 that aren't at the source. Default is True.
385 verbosity : int
386 A modifier for how much info to output either to stdout or the INFO-level
387 logs. Defaults to 0.
388 rsync_args: list of str, optional
389 Any additional arguments to pass into rsync. Note that rsync is run by
390 default with the flags: `-shaz`
392 Raises
393 ------
394 FileNotFoundError
395 If the destination folder does not exist
397 Notes
398 -----
399 - This method does not provide for interactive authentication. If using
400 rsync over SSH, you'll need to be set up for password-less (key-based)
401 access.
402 - If the destination folder does not already exist, this method will not
403 create it or its parent directories.
404 """
405 if not local_path.exists():
406 raise FileNotFoundError(f"{local_path} does not exist")
408 if remote_uri.netloc == get_default_netloc():
409 SYNC_LOGGER.debug("Performing sync as a local transfer")
410 remote_path: str = unquote(remote_uri.path)
411 elif use_daemon:
412 remote_path = remote_uri.geturl()
413 else:
414 remote_path = uri_to_ssh(remote_uri)
416 if rsync_args: # pragma: no cover
417 raise NotImplementedError
419 run_rsync(
420 local_path.parent,
421 remote_path,
422 local_path.name,
423 delete,
424 dry_run,
425 exclude,
426 *(rsync_args or ()),
427 timeout=timeout,
428 verbosity=verbosity,
429 )
def push(
    local_path: Path,
    remote_uri: ParseResult,
    exclude: Iterable[str],
    dry_run: bool,
    use_daemon: bool = False,
    timeout: int | None = None,
    delete: bool = True,
    verbosity: int = 0,
    rsync_args: Iterable[str] | None = None,
) -> None:
    """Use rsync to send a local file or folder up into the specified
    location, overwriting whatever files and folders are already there.

    Parameters
    ----------
    local_path : Path
        The file or folder to copy
    remote_uri : ParseResult
        The URI for the remote location to copy into
    exclude : list of str
        Any patterns that should be excluded from the sync
    dry_run : bool
        Whether to only simulate this sync (report the operations to be
        performed but not actually perform them)
    use_daemon : bool, optional
        By default, the rsync is performed over ssh. If you happen to have an
        rsync daemon running on your system, however, you're welcome to
        leverage it instead by passing in `use_daemon=True`
    timeout : int, optional
        The number of seconds to wait before timing out the sync operation.
        If None is provided, no explicit timeout value will be set.
    delete : bool, optional
        Whether part of the syncing should include deleting files at the
        destination that aren't at the source. Default is True.
    verbosity : int
        A modifier for how much info to output either to stdout or the
        INFO-level logs. Defaults to 0.
    rsync_args : list of str, optional
        Any additional arguments to pass into rsync. Note that rsync is run by
        default with the flags: `-shaz`

    Raises
    ------
    NotImplementedError
        If any `rsync_args` are provided

    Notes
    -----
    - This method does not provide for interactive authentication. If using
      rsync over SSH, you'll need to be set up for password-less (key-based)
      access.
    - If the destination folder does not already exist, this method will very
      likely fail.
    """
    # Work out how the destination should be addressed on the rsync command line
    destination_spec: str
    is_local_transfer = remote_uri.netloc == get_default_netloc()
    if is_local_transfer:
        SYNC_LOGGER.debug("Performing sync as a local transfer")
        destination_spec = unquote(remote_uri.path)
    elif not use_daemon:
        destination_spec = uri_to_ssh(remote_uri)
    else:
        destination_spec = remote_uri.geturl()

    if rsync_args:  # pragma: no cover
        raise NotImplementedError

    # The sync runs from the source's parent folder and refers to the source
    # by name.
    extra_args = tuple(rsync_args or ())
    run_rsync(
        local_path.parent,
        local_path.name,
        destination_spec,
        delete,
        dry_run,
        exclude,
        *extra_args,
        timeout=timeout,
        verbosity=verbosity,
    )