Coverage for enderchest/sync/rsync.py: 87%
138 statements
« prev ^ index » next coverage.py v7.4.1, created at 2024-02-06 16:00 +0000
« prev ^ index » next coverage.py v7.4.1, created at 2024-02-06 16:00 +0000
1"""rsync sync implementation. Relies on the user having rsync installed on their system"""
2import os.path
3import re
4import shutil
5import subprocess
6from collections import defaultdict
7from pathlib import Path
8from typing import Iterable
9from urllib.parse import ParseResult, unquote
11from . import SYNC_LOGGER, get_default_netloc, uri_to_ssh
# Resolve the rsync executable once, at import time. Nothing in this module
# can work without it, so the import itself fails when it cannot be found.
if (RSYNC := shutil.which("rsync")) is None:  # pragma: no cover
    raise RuntimeError(
        "No rsync executable found on your system. Cannot sync using this protocol."
    )
def _parse_rsync_version(head: str) -> tuple[int, int]:
    """Extract the major and minor version from the first line of the
    `rsync --version` output

    Parameters
    ----------
    head : str
        The first line printed by `rsync --version`

    Returns
    -------
    int
        The major version
    int
        The minor version

    Raises
    ------
    RuntimeError
        If the line does not start with "rsync version X.Y.Z"
    """
    # the dots are escaped so that only a real dotted version number matches
    match = re.match(r"^rsync\s+version ([0-9]+)\.([0-9]+)\.([0-9]+)", head)
    if match is None:
        raise RuntimeError(f"Could not parse version output:\n{head}")
    major, minor, _patch = match.groups()
    return int(major), int(minor)


def _get_rsync_version() -> tuple[int, int]:
    """Determine the installed version of Rsync

    Returns
    -------
    int
        The major version of the resolved Rsync executable
    int
        The minor version of the resolved Rsync executable

    Raises
    ------
    RuntimeError
        If Rsync is not installed or cannot be launched, if `rsync --version`
        writes anything to stderr, prints nothing at all, or if the version
        information cannot be decoded from the `rsync --version` output
    """
    try:
        result = subprocess.run(
            ["rsync", "--version"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=False,
        )
    except OSError as launch_error:
        # OSError covers a missing executable (FileNotFoundError) as well as
        # one that exists but cannot be run (e.g. PermissionError)
        raise RuntimeError(
            "Rsync is not installed or could not be executed."
        ) from launch_error

    if result.stderr:  # TODO: #124 just use check=True
        raise RuntimeError(result.stderr.decode("utf-8"))

    output_lines = result.stdout.decode("utf-8").splitlines()
    if not output_lines:
        raise RuntimeError("Rsync is not installed or could not be executed.")

    return _parse_rsync_version(output_lines[0])
# Detect the installed rsync version at import time and refuse to load this
# module when it is older than the minimum supported release (3.2).
rsync_version = _get_rsync_version()
if not rsync_version >= (3, 2):
    raise RuntimeError(
        "EnderChest requires Rsync 3.2 or newer."
        f" The version detected on your system is {rsync_version[0]}.{rsync_version[1]}"
    )
def run_rsync(
    working_directory: Path,
    source: str,
    destination_folder: str,
    delete: bool,
    dry_run: bool,
    exclude: Iterable[str],
    *additional_args: str,
    timeout: int | None = None,
    verbosity: int = 0,
    rsync_flags: str | None = None,
) -> None:
    """Run an operation with rsync

    Parameters
    ----------
    working_directory : Path
        The working directory to run the sync command from
    source : str
        The source file or folder to sync, specified as either a URI string,
        an ssh address or a path relative to the working directory
    destination_folder : str
        The destination folder where the file or folder should be synced to,
        with the same formats available as for source
    delete : bool
        Whether part of the syncing should include deleting files at the destination
        that aren't at the source
    dry_run : bool
        Whether to only simulate this sync (report the operations to be performed
        but not actually perform them)
    exclude : list of str
        Any patterns that should be excluded from the sync
    *additional_args : str
        Any additional arguments to pass into the rsync command
    timeout : int, optional
        The number of seconds to wait before timing out the sync operation.
        If None (or 0) is provided, no explicit timeout value will be set.
    verbosity : int
        A modifier for how much info to output either to stdout or the INFO-level
        logs. At...

        - verbosity = -2 : No information will be printed, even on dry runs
        - verbosity = -1 : The sync itself will be silent. Dry runs will only
                           report the sync statistics.
        - verbosity =  0 : Actual syncs will display a progress bar. Dry run
                           reports will summarize the changes to each shulker
                           box in addition to reporting the sync statistics.
        - verbosity =  1 : Actual syncs will report the progress of each file
                           transfer. Dry runs will report on each file to
                           be created, updated or deleted.
        - verbosity =  2 : Dry runs and syncs will print or log the output
                           of rsync run using the `-vv` modifier

        Verbosity values outside of this range will simply be capped / floored
        to [-2, 2].
    rsync_flags : str, optional
        By default, rsync will be run using the flags "shaz" which means:

          - no space splitting
          - output numbers (file sizes, mostly) in a human-readable format
          - archive mode (see: https://www.baeldung.com/linux/rsync-archive-mode)
          - compress data during transfer

        Advanced users may choose to override these options, but **you do so
        at your own peril**.

    Raises
    ------
    TimeoutError
        If the rsync operation times out before completion
    FileNotFoundError
        If rsync reports "No such file or directory" on stderr
    RuntimeError
        If the rsync operation fails for any other reason (that is, if it
        writes anything else to stderr)

    Notes
    -----
    This method does not perform any validation or normalization of the source,
    destination, exclude-list, additional arguments or rsync options.
    """
    rsync_flags = rsync_flags or "shaz"
    # enforce the documented [-2, 2] range so that, for example, verbosity=5
    # does not turn into "-vvvvv"
    verbosity = max(-2, min(verbosity, 2))

    args: list[str] = [RSYNC, f"-{rsync_flags}"]  # type: ignore[list-item]
    if delete:
        args.append("--delete")
    if dry_run:
        args.extend(("--dry-run", "--stats"))
        if verbosity < 1:
            # at 1+ we don't need it to be machine-parseable
            args.append("--out-format=%i %n")
    else:
        if verbosity >= 0:
            args.append("--stats")
        if verbosity == 0:
            args.append("--info=progress2")
        if verbosity >= 1:
            args.append("--progress")
    if verbosity > 0:
        args.append("-" + "v" * verbosity)

    for pattern in exclude:
        args.extend(("--exclude", pattern))
    args.extend(additional_args)
    args.extend((source, destination_folder))

    SYNC_LOGGER.debug(
        "Executing the following command:\n %s",
        " ".join(args),
    )

    with subprocess.Popen(
        args,
        # only dry-run output is captured; a real sync writes straight to the
        # parent's stdout
        stdout=subprocess.PIPE if dry_run else None,
        stderr=subprocess.PIPE,
        cwd=working_directory,
    ) as proc:
        try:
            # communicate() drains both pipes while it waits. Calling wait()
            # and only reading afterwards can stall once rsync has filled a
            # pipe buffer (large dry-run reports, many error messages).
            raw_output, raw_errors = proc.communicate(timeout=timeout or None)
        except subprocess.TimeoutExpired as times_up:
            proc.kill()
            # collect whatever was written before the process was killed
            raw_output, raw_errors = proc.communicate()
            if raw_output:
                SYNC_LOGGER.warning(raw_output.decode("UTF-8"))
            if raw_errors:
                SYNC_LOGGER.error(raw_errors.decode("UTF-8"))
            raise TimeoutError("Timeout reached.") from times_up

        if raw_output:
            output_log = raw_output.decode("UTF-8")
            if verbosity > 0:
                dry_run_output = output_log.splitlines()
            else:
                # logs the per-folder summary and hands back the stats lines
                dry_run_output = summarize_rsync_report(output_log)
            SYNC_LOGGER.info("\nSUMMARY\n-------")
            for line in dry_run_output:
                if _is_important_stats_line(line):
                    # 25 sits between logging.INFO (20) and logging.WARNING (30)
                    SYNC_LOGGER.log(25, line)
                else:
                    SYNC_LOGGER.debug(line)

        if raw_errors:
            error_log = raw_errors.decode("UTF-8")
            if "No such file or directory" in error_log:
                raise FileNotFoundError(error_log)
            raise RuntimeError(error_log)  # pragma: no cover
def summarize_rsync_report(raw_output: str, depth: int = 2) -> list[str]:
    """Take the captured output from running
    `rsync -ha --out-format="%i %n"`
    and report a high-level summary to the logging.INFO level

    Parameters
    ----------
    raw_output : str
        The raw output captured from running the rsync command
    depth : int, optional
        How many directories to go down from the root to generate the summary.
        Default is 2 (just report on top-level files and folders within the
        source folder).

    Returns
    -------
    list of str
        Any lines that weren't part of the rsync report (and were probably
        part of `--stats`?)

    Notes
    -----
    The rsync man page (https://linux.die.net/man/1/rsync) describes the output
    format as... "cryptic," which I find rather charitable. The relevant bits
    are that `--out-format="%i %n"` produces:
      - `%i` : a string of 11 characters that gives various metadata about the file
               transfer operation (is it a file, a directory or a link? Is it being
               sent or received? Created, updated or deleted?)
      - `%n`: the path of the file (or whatever), unquoted, un-escaped

    Every recognized report line is also echoed to the DEBUG log.
    """
    # Each key maps either to a per-operation counter or, when the key itself
    # is the thing being created / updated / deleted, to that operation's name.
    summary: dict[str, dict[str, int] | str] = defaultdict(
        lambda: {"create": 0, "update": 0, "delete": 0}
    )
    stats: list[str] = []
    for line in raw_output.splitlines():
        if not line.strip():  # skip empty and whitespace-only lines
            continue

        # Split off only the first token so that runs of spaces *inside* the
        # file name are kept intact.
        info, *remainder = line.split(maxsplit=1)
        full_path = os.path.normpath(remainder[0] if remainder else "")
        path_key = os.sep.join(full_path.split(os.sep)[:depth])

        operation: str | None
        if info.startswith("*deleting"):
            operation = "delete"
        elif info[2:5] == "+++":  # this is a creation
            operation = "create"
        elif info[:2] in ("<f", ">f", "cL"):
            # a file transfer (creations were caught above, so this must be an
            # update) or, as far as I can tell, the replacement of a link
            operation = "update"
        elif info[:1] == ".":
            # this just means permissions or dates are being updated or something
            operation = None
        else:  # then hopefully this is part of the stats report
            stats.append(line)
            continue

        if operation is not None:
            if full_path == path_key:
                # the whole key is being created / updated / deleted
                summary[path_key] = operation
            elif not (operation == "create" and info[1] == "d"):
                # (directories created below the key are not counted)
                entry = summary[path_key]
                if not isinstance(entry, str):
                    entry[operation] += 1
                # otherwise this is already described by the top-level op

        SYNC_LOGGER.debug(line)

    for path_key, report in sorted(summary.items()):
        if isinstance(report, str):
            # nice that these verbs follow the same pattern
            SYNC_LOGGER.info("%sing %s", report[:-1].title(), path_key)
        else:
            # The path is passed as a logging argument, not interpolated into
            # the format string: a "%" in a file name would otherwise break
            # the message formatting.
            SYNC_LOGGER.info(
                "Within %s...\n%s",
                path_key,
                "\n".join(
                    f" - {op[:-1].title()}ing {count} file{'' if count == 1 else 's'}"
                    for op, count in report.items()
                ),
            )
    return stats
def _is_important_stats_line(line: str) -> bool:
    """Decide whether a line of rsync statistics deserves INFO-level logging
    or can be left to the DEBUG log

    Parameters
    ----------
    line : str
        The log line to evaluate

    Returns
    -------
    bool
        True if the line begins with one of the headline statistics,
        False otherwise
    """
    headline_prefixes = (
        "Number of created files:",
        "Number of deleted files:",
        "Number of regular files transferred:",
        "Total transferred file size:",
    )
    return any(line.startswith(prefix) for prefix in headline_prefixes)
349def pull(
350 remote_uri: ParseResult,
351 local_path: Path,
352 exclude: Iterable[str],
353 dry_run: bool,
354 use_daemon: bool = False,
355 timeout: int | None = None,
356 delete: bool = True,
357 verbosity: int = 0,
358 rsync_args: Iterable[str] | None = None,
359) -> None:
360 """Sync an upstream file or folder into the specified location using rsync.
361 This will overwrite any files and folders already at the destination.
363 Parameters
364 ----------
365 remote_uri : ParseResult
366 The URI for the remote resource to copy from
367 local_path : Path
368 The destination folder
369 exclude : list of str
370 Any patterns that should be excluded from the sync
371 dry_run : bool
372 Whether to only simulate this sync (report the operations to be performed
373 but not actually perform them)
374 use_daemon : bool, optional
375 By default, the rsync is performed over ssh. If you happen to have an
376 rsync daemon running on your system, however, you're welcome to leverage
377 it instead by passing in `use_daemon=True`
378 timeout : int, optional
379 The number of seconds to wait before timing out the sync operation.
380 If None is provided, no explicit timeout value will be set.
381 delete : bool, optional
382 Whether part of the syncing should include deleting files at the destination
383 that aren't at the source. Default is True.
384 verbosity : int
385 A modifier for how much info to output either to stdout or the INFO-level
386 logs. Defaults to 0.
387 rsync_args: list of str, optional
388 Any additional arguments to pass into rsync. Note that rsync is run by
389 default with the flags: `-shaz`
391 Raises
392 ------
393 FileNotFoundError
394 If the destination folder does not exist
396 Notes
397 -----
398 - This method does not provide for interactive authentication. If using
399 rsync over SSH, you'll need to be set up for password-less (key-based)
400 access.
401 - If the destination folder does not already exist, this method will not
402 create it or its parent directories.
403 """
404 if not local_path.exists():
405 raise FileNotFoundError(f"{local_path} does not exist")
407 if remote_uri.netloc == get_default_netloc():
408 SYNC_LOGGER.debug("Performing sync as a local transfer")
409 remote_path: str = unquote(remote_uri.path)
410 elif use_daemon:
411 remote_path = remote_uri.geturl()
412 else:
413 remote_path = uri_to_ssh(remote_uri)
415 if rsync_args: # pragma: no cover
416 raise NotImplementedError
418 run_rsync(
419 local_path.parent,
420 remote_path,
421 local_path.name,
422 delete,
423 dry_run,
424 exclude,
425 *(rsync_args or ()),
426 timeout=timeout,
427 verbosity=verbosity,
428 )
def push(
    local_path: Path,
    remote_uri: ParseResult,
    exclude: Iterable[str],
    dry_run: bool,
    use_daemon: bool = False,
    timeout: int | None = None,
    delete: bool = True,
    verbosity: int = 0,
    rsync_args: Iterable[str] | None = None,
) -> None:
    """Use rsync to copy a local file or folder into a remote location,
    overwriting whatever files and folders are already there.

    Parameters
    ----------
    local_path : Path
        The file or folder to copy
    remote_uri : ParseResult
        The URI for the remote location to copy into
    exclude : list of str
        Any patterns that should be excluded from the sync
    dry_run : bool
        Whether to only simulate this sync (report the operations to be performed
        but not actually perform them)
    use_daemon : bool, optional
        By default, the rsync is performed over ssh. Pass `use_daemon=True`
        to hand rsync the URI as-is instead (for use with an rsync daemon).
    timeout : int, optional
        The number of seconds to wait before timing out the sync operation.
        If None is provided, no explicit timeout value will be set.
    delete : bool, optional
        Whether part of the syncing should include deleting files at the destination
        that aren't at the source. Default is True.
    verbosity : int
        A modifier for how much info to output either to stdout or the INFO-level
        logs. Defaults to 0.
    rsync_args: list of str, optional
        Any additional arguments to pass into rsync. Note that rsync is run by
        default with the flags: `-shaz`

    Raises
    ------
    NotImplementedError
        If any `rsync_args` are provided

    Notes
    -----
    - This method does not provide for interactive authentication. If using
      rsync over SSH, you'll need to be set up for password-less (key-based)
      access.
    - If the destination folder does not already exist, this method will very
      likely fail.
    """
    # Work out how to address the destination: a plain path when the URI
    # points at this machine, the raw URI for a daemon, or an ssh address
    # otherwise.
    destination_address: str
    if remote_uri.netloc == get_default_netloc():
        SYNC_LOGGER.debug("Performing sync as a local transfer")
        destination_address = unquote(remote_uri.path)
    else:
        destination_address = (
            remote_uri.geturl() if use_daemon else uri_to_ssh(remote_uri)
        )

    if rsync_args:  # pragma: no cover
        raise NotImplementedError

    # rsync runs from the source's parent and refers to the source by name
    extra_args = rsync_args or ()
    run_rsync(
        local_path.parent,
        local_path.name,
        destination_address,
        delete,
        dry_run,
        exclude,
        *extra_args,
        timeout=timeout,
        verbosity=verbosity,
    )