Coverage for enderchest/sync/rsync.py: 86%
119 statements
« prev ^ index » next coverage.py v7.3.2, created at 2023-10-03 20:14 +0000
« prev ^ index » next coverage.py v7.3.2, created at 2023-10-03 20:14 +0000
1"""rsync sync implementation. Relies on the user having rsync installed on their system"""
2import os.path
3import shutil
4import subprocess
5from collections import defaultdict
6from pathlib import Path
7from typing import Iterable
8from urllib.parse import ParseResult, unquote
10from . import SYNC_LOGGER, get_default_netloc, uri_to_ssh
# Locate the rsync executable once, at import time. run_rsync() uses this path
# as the first element of every command it builds, so importing this module
# fails fast when rsync is not available.
RSYNC = shutil.which("rsync")
if RSYNC is None:  # pragma: no cover
    raise RuntimeError(
        "No rsync executable found on your system. Cannot sync using rsync."
    )
def run_rsync(
    working_directory: Path,
    source: str,
    destination_folder: str,
    delete: bool,
    dry_run: bool,
    exclude: Iterable[str],
    *additional_args: str,
    timeout: int | None = None,
    verbosity: int = 0,
    rsync_flags: str | None = None,
) -> None:
    """Run an operation with rsync

    Parameters
    ----------
    working_directory : Path
        The working directory to run the sync command from
    source : str
        The source file or folder to sync, specified as either a URI string,
        an ssh address or a path relative to the working directory
    destination_folder : str
        The destination folder where the file or folder should be synced to,
        with the same formats available as for source
    delete : bool
        Whether part of the syncing should include deleting files at the destination
        that aren't at the source
    dry_run : bool
        Whether to only simulate this sync (report the operations to be performed
        but not actually perform them)
    exclude : list of str
        Any patterns that should be excluded from the sync
    *additional_args : str
        Any additional arguments to pass into the rsync command
    timeout : int, optional
        The number of seconds to wait before timing out the sync operation.
        If None (or 0) is provided, no explicit timeout value will be set.
    verbosity : int
        A modifier for how much info to output either to stdout or the INFO-level
        logs. At...

          - verbosity = -2 : No information will be printed, even on dry runs
          - verbosity = -1 : The sync itself will be silent. Dry runs will only
                             report the sync statistics.
          - verbosity =  0 : Actual syncs will display a progress bar. Dry run
                             reports will summarize the changes to each shulker
                             box in addition to reporting the sync statistics.
          - verbosity =  1 : Actual syncs will report the progress of each file
                             transfer. Dry runs will report on each file to
                             be created, updated or deleted.
          - verbosity =  2 : Dry runs and syncs will print or log the output
                             of rsync run using the `-vv` modifier

        Values below -2 select the same rsync arguments as -2. Values above 2
        are *not* clamped when building the command: rsync is run with one
        `v` per verbosity level (e.g. `-vvv` for `verbosity = 3`).
    rsync_flags : str, optional
        By default, rsync will be run using the flags "shaz" which means:

          - no space splitting
          - make output (file sizes, mostly) human-readable
          - archive mode (see: https://www.baeldung.com/linux/rsync-archive-mode)
          - compress data during transfer

        Advanced users may choose to override these options, but **you do so
        at your own peril**.

    Raises
    ------
    TimeoutError
        If the rsync operation times out before completion
    FileNotFoundError
        If rsync writes a "No such file or directory" message to stderr
    RuntimeError
        If rsync writes anything else to stderr

    Notes
    -----
    - This method does not perform any validation or normalization of the
      source, destination, exclude-list, additional arguments or rsync options.
    - Failure is detected from stderr output alone; the exit status of the
      rsync process is not inspected.
    """
    rsync_flags = rsync_flags or "shaz"

    args: list[str] = [RSYNC, f"-{rsync_flags}"]  # type: ignore[list-item]
    if delete:
        args.append("--delete")
    if dry_run:
        args.extend(("--dry-run", "--stats"))
        if verbosity < 1:
            # at 1+ we don't need it to be machine-parseable
            args.append("--out-format=%i %n")
    else:
        if verbosity >= 0:
            args.append("--stats")
        if verbosity == 0:
            args.append("--info=progress2")
        if verbosity >= 1:
            args.append("--progress")
    if verbosity > 0:
        args.append("-" + "v" * verbosity)

    for pattern in exclude:
        args.extend(("--exclude", pattern))
    args.extend(additional_args)
    args.extend((source, destination_folder))

    SYNC_LOGGER.debug(
        "Executing the following command:\n %s",
        " ".join(args),
    )

    with subprocess.Popen(
        args,
        # stdout is only captured for dry runs (so the report can be
        # summarized); real syncs write straight to the terminal
        stdout=subprocess.PIPE if dry_run else None,
        stderr=subprocess.PIPE,
        cwd=working_directory,
    ) as proc:
        try:
            # communicate() drains both pipes while waiting. Calling wait()
            # with PIPEs attached can deadlock once the child fills a pipe
            # buffer. A falsy timeout (None or 0) means "wait indefinitely".
            raw_output, raw_errors = proc.communicate(timeout=timeout or None)
        except subprocess.TimeoutExpired as times_up:
            proc.kill()
            # collect whatever the process managed to emit before it was killed
            raw_output, raw_errors = proc.communicate()
            if raw_output:
                SYNC_LOGGER.warning(raw_output.decode("UTF-8"))
            if raw_errors:
                SYNC_LOGGER.error(raw_errors.decode("UTF-8"))
            raise TimeoutError("Timeout reached.") from times_up

        # raw_output is None when stdout was not captured (i.e. not a dry run)
        if raw_output is not None:
            if output_log := raw_output.decode("UTF-8"):
                if verbosity > 0:
                    dry_run_output = output_log.splitlines()
                else:
                    dry_run_output = summarize_rsync_report(output_log)
                SYNC_LOGGER.info("\nSUMMARY\n-------")
                for line in dry_run_output:
                    if _is_important_stats_line(line):
                        # 25 sits between logging.INFO (20) and logging.WARNING (30)
                        SYNC_LOGGER.log(25, line)
                    else:
                        SYNC_LOGGER.debug(line)

        if raw_errors is not None:
            if error_log := raw_errors.decode("UTF-8"):
                if "No such file or directory" in error_log:
                    raise FileNotFoundError(error_log)
                raise RuntimeError(error_log)  # pragma: no cover
def summarize_rsync_report(raw_output: str, depth: int = 2) -> list[str]:
    """Take the captured output from running
    `rsync -ha --out-format="%i %n"`
    and report a high-level summary to the logging.INFO level

    Parameters
    ----------
    raw_output : str
        The raw output captured from running the rsync command
    depth : int, optional
        How many directories to go down from the root to generate the summary.
        Default is 2 (just report on top-level files and folders within the
        source folder).

    Returns
    -------
    list of str
        Any lines that weren't part of the rsync report (and were probably
        part of `--stats`?)

    Notes
    -----
    The rsync man page (https://linux.die.net/man/1/rsync) describes the output
    format as... "cryptic," which I find rather charitable. The relevant bits
    are that `--out-format="%i %n"` produces:
      - `%i` : a string of 11 characters that gives various metadata about the file
               transfer operation (is it a file, a directory or a link? Is it being
               sent or received? Created, updated or deleted?)
      - `%n`: the path of the file (or whatever), unquoted, un-escaped

    Every line that is recognized as part of the itemized report is also
    echoed to the DEBUG log. Empty and whitespace-only lines are ignored.
    """
    summary: dict[str, dict[str, int] | str] = defaultdict(
        lambda: {"create": 0, "update": 0, "delete": 0}
    )
    stats: list[str] = []
    for line in raw_output.splitlines():
        # Split only once: the path is un-escaped, so any further whitespace
        # belongs to the file name and must be kept intact.
        fields = line.split(maxsplit=1)
        if not fields:  # skip empty (and whitespace-only) lines
            continue

        info = fields[0]
        full_path = os.path.normpath(fields[1] if len(fields) > 1 else "")
        path_key = os.sep.join(full_path.split(os.sep)[:depth])

        if info.startswith("*deleting"):
            if full_path == path_key:
                summary[path_key] = "delete"
            else:
                entry = summary[path_key]
                if not isinstance(entry, str):
                    entry["delete"] += 1
                # otherwise the whole thing is being deleted
        elif info[2:5] == "+++":  # this is a creation
            if full_path == path_key:
                summary[path_key] = "create"
            else:
                if info[1] != "d":  # don't count directories
                    entry = summary[path_key]
                    if isinstance(entry, str):
                        # then this is described by the top-level op
                        pass
                    else:
                        entry["create"] += 1
                # otherwise the whole key is being created
        elif info[:2] in ("<f", ">f", "cL"):
            # "<f" / ">f" is a file transfer and "cL" is replacing a link (as
            # far as I can tell). Creates were caught above, so this must be
            # an update.
            if full_path == path_key:
                summary[path_key] = "update"
            else:
                entry = summary[path_key]
                if isinstance(entry, str):  # pragma: no cover
                    # this should never happen, but still
                    pass
                else:
                    entry["update"] += 1
        elif info[:1] == ".":  # pragma: no cover
            # this just means permissions or dates are being updated or something
            pass
        else:  # then hopefully this is part of the stats report
            stats.append(line)
            continue

        SYNC_LOGGER.debug(line)

    for path_key, report in sorted(summary.items()):
        if isinstance(report, str):
            # nice that these verbs follow the same pattern
            # (no logging args are passed, so a "%" in the path is harmless here)
            SYNC_LOGGER.info(f"{report[:-1].title()}ing {path_key}")
        else:
            # The path goes in as an argument rather than into the format
            # string, so a "%" in a file name can't break %-formatting.
            SYNC_LOGGER.info(
                "Within %s...\n%s",
                path_key,
                "\n".join(
                    f" - {op[:-1].title()}ing {count} file{'' if count == 1 else 's'}"
                    for op, count in report.items()
                ),
            )
    return stats
def _is_important_stats_line(line: str) -> bool:
    """Decide whether a stats line deserves INFO-level attention or should
    only show up in the DEBUG log

    Parameters
    ----------
    line : str
        The log line to evaluate

    Returns
    -------
    bool
        True when the line begins with one of the headline statistics,
        False for anything else
    """
    headline_prefixes = (
        "Number of created files:",
        "Number of deleted files:",
        "Number of regular files transferred:",
        "Total transferred file size:",
    )
    return any(line.startswith(prefix) for prefix in headline_prefixes)
295def pull(
296 remote_uri: ParseResult,
297 local_path: Path,
298 exclude: Iterable[str],
299 dry_run: bool,
300 use_daemon: bool = False,
301 timeout: int | None = None,
302 delete: bool = True,
303 verbosity: int = 0,
304 rsync_args: Iterable[str] | None = None,
305) -> None:
306 """Sync an upstream file or folder into the specified location using rsync.
307 This will overwrite any files and folders already at the destination.
309 Parameters
310 ----------
311 remote_uri : ParseResult
312 The URI for the remote resource to copy from
313 local_path : Path
314 The destination folder
315 exclude : list of str
316 Any patterns that should be excluded from the sync
317 dry_run : bool
318 Whether to only simulate this sync (report the operations to be performed
319 but not actually perform them)
320 use_daemon : bool, optional
321 By default, the rsync is performed over ssh. If you happen to have an
322 rsync daemon running on your system, however, you're welcome to leverage
323 it instead by passing in `use_daemon=True`
324 timeout : int, optional
325 The number of seconds to wait before timing out the sync operation.
326 If None is provided, no explicit timeout value will be set.
327 delete : bool, optional
328 Whether part of the syncing should include deleting files at the destination
329 that aren't at the source. Default is True.
330 verbosity : int
331 A modifier for how much info to output either to stdout or the INFO-level
332 logs. Defaults to 0.
333 rsync_args: list of str, optional
334 Any additional arguments to pass into rsync. Note that rsync is run by
335 default with the flags: `-shaz`
337 Raises
338 ------
339 FileNotFoundError
340 If the destination folder does not exist
342 Notes
343 -----
344 - This method does not provide for interactive authentication. If using
345 rsync over SSH, you'll need to be set up for password-less (key-based)
346 access.
347 - If the destination folder does not already exist, this method will not
348 create it or its parent directories.
349 """
350 if not local_path.exists():
351 raise FileNotFoundError(f"{local_path} does not exist")
353 if remote_uri.netloc == get_default_netloc():
354 SYNC_LOGGER.debug("Performing sync as a local transfer")
355 remote_path: str = unquote(remote_uri.path)
356 elif use_daemon:
357 remote_path = remote_uri.geturl()
358 else:
359 remote_path = uri_to_ssh(remote_uri)
361 if rsync_args: # pragma: no cover
362 raise NotImplementedError
364 run_rsync(
365 local_path.parent,
366 remote_path,
367 local_path.name,
368 delete,
369 dry_run,
370 exclude,
371 *(rsync_args or ()),
372 timeout=timeout,
373 verbosity=verbosity,
374 )
def push(
    local_path: Path,
    remote_uri: ParseResult,
    exclude: Iterable[str],
    dry_run: bool,
    use_daemon: bool = False,
    timeout: int | None = None,
    delete: bool = True,
    verbosity: int = 0,
    rsync_args: Iterable[str] | None = None,
) -> None:
    """Use rsync to send a local file or folder to the specified location,
    overwriting whatever files and folders are already there.

    Parameters
    ----------
    local_path : Path
        The file or folder to copy
    remote_uri : ParseResult
        The URI for the remote location to copy into
    exclude : list of str
        Any patterns that should be excluded from the sync
    dry_run : bool
        Whether to only simulate this sync (report the operations to be performed
        but not actually perform them)
    use_daemon : bool, optional
        The transfer goes over ssh unless this is set to True, in which case
        the remote URI is handed to rsync as-is so that an rsync daemon can
        serve it.
    timeout : int, optional
        The number of seconds to wait before timing out the sync operation.
        If None is provided, no explicit timeout value will be set.
    delete : bool, optional
        Whether part of the syncing should include deleting files at the destination
        that aren't at the source. Default is True.
    verbosity : int
        A modifier for how much info to output either to stdout or the INFO-level
        logs. Defaults to 0.
    rsync_args: list of str, optional
        Any additional arguments to pass into rsync. Note that rsync is run by
        default with the flags: `-shaz`

    Raises
    ------
    NotImplementedError
        If any `rsync_args` are provided

    Notes
    -----
    - This method does not provide for interactive authentication. If using
      rsync over SSH, you'll need to be set up for password-less (key-based)
      access.
    - If the destination folder does not already exist, this method will very
      likely fail.
    - When the URI's netloc equals the default netloc, the sync is run as a
      plain local transfer into the (unquoted) URI path.
    """
    target: str
    if remote_uri.netloc == get_default_netloc():
        SYNC_LOGGER.debug("Performing sync as a local transfer")
        target = unquote(remote_uri.path)
    else:
        # daemon mode takes the URL verbatim; otherwise use the ssh address
        target = remote_uri.geturl() if use_daemon else uri_to_ssh(remote_uri)

    if rsync_args:  # pragma: no cover
        raise NotImplementedError

    # rsync is run from the source's parent so that the source can be
    # addressed by its bare name
    run_from, source_name = local_path.parent, local_path.name
    run_rsync(
        run_from,
        source_name,
        target,
        delete,
        dry_run,
        exclude,
        *(rsync_args or ()),
        timeout=timeout,
        verbosity=verbosity,
    )