Coverage for enderchest/sync/rsync.py: 87%

138 statements  

« prev     ^ index     » next       coverage.py v7.5.0, created at 2024-05-04 01:41 +0000

1"""rsync sync implementation. Relies on the user having rsync installed on their system""" 

2 

3import os.path 

4import re 

5import shutil 

6import subprocess 

7from collections import defaultdict 

8from pathlib import Path 

9from typing import Iterable 

10from urllib.parse import ParseResult, unquote 

11 

12from . import SYNC_LOGGER, get_default_netloc, uri_to_ssh 

13 

# Resolve the rsync executable on the PATH once, at import time. Importing this
# module on a system where no rsync can be found fails immediately.
RSYNC = shutil.which("rsync")
if RSYNC is None:  # pragma: no cover
    raise RuntimeError(
        "No rsync executable found on your system."
        " Cannot sync using this protocol."
    )

19 

20 

def _get_rsync_version() -> tuple[int, int]:
    """Determine the installed version of Rsync

    Returns
    -------
    int
        The major version of the resolved Rsync executable
    int
        The minor version of the resolved Rsync executable

    Raises
    ------
    RuntimeError
        If Rsync is not installed or cannot be executed, if `rsync --version`
        writes anything to stderr or if the version information cannot be
        decoded from the first line of the `rsync --version` output
    """
    try:
        result = subprocess.run(
            ["rsync", "--version"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=False,
        )
    except OSError as launch_error:
        # FileNotFoundError (no rsync on the PATH) is an OSError, as are the
        # other ways launching the executable can fail (e.g. PermissionError)
        raise RuntimeError(
            "Rsync is not installed or could not be executed."
        ) from launch_error

    if result.stderr:  # TODO: #124 just use check=True
        raise RuntimeError(result.stderr.decode("utf-8"))

    output_lines = result.stdout.decode("utf-8").splitlines()
    if not output_lines:
        # the command ran but printed nothing we could read a version from
        raise RuntimeError("Rsync is not installed or could not be executed.")
    head = output_lines[0]

    # the dots are escaped so that only a real "X.Y.Z" version string matches
    match = re.match(r"^rsync\s+version ([0-9]+)\.([0-9]+)\.([0-9]+)", head)
    if match is None:
        raise RuntimeError(f"Could not parse version output:\n{head}")

    major, minor, _patch = match.groups()
    return int(major), int(minor)

62 

63 

# Check the installed rsync against the minimum supported version at import
# time, so that an outdated rsync is reported before any sync is attempted.
rsync_version = _get_rsync_version()
if rsync_version < (3, 2):
    raise RuntimeError(
        "EnderChest requires Rsync 3.2 or newer."
        f" The version detected on your system is {rsync_version[0]}.{rsync_version[1]}"
    )

70 

71 

def run_rsync(
    working_directory: Path,
    source: str,
    destination_folder: str,
    delete: bool,
    dry_run: bool,
    exclude: Iterable[str],
    *additional_args: str,
    timeout: int | None = None,
    verbosity: int = 0,
    rsync_flags: str | None = None,
) -> None:
    """Run an operation with rsync

    Parameters
    ----------
    working_directory : Path
        The working directory to run the sync command from
    source : str
        The source file or folder to sync, specified as either a URI string,
        an ssh address or a path relative to the working directory
    destination_folder : str
        The destination folder where the file or folder should be synced to,
        with the same formats available as for source
    delete : bool
        Whether part of the syncing should include deleting files at the destination
        that aren't at the source
    dry_run : bool
        Whether to only simulate this sync (report the operations to be performed
        but not actually perform them)
    exclude : list of str
        Any patterns that should be excluded from the sync
    *additional_args : str
        Any additional arguments to pass into the rsync command
    timeout : int, optional
        The number of seconds to wait before timing out the sync operation.
        If None (or 0) is provided, no explicit timeout value will be set.
    verbosity : int
        A modifier for how much info to output either to stdout or the INFO-level
        logs. At...

        - verbosity = -2 : No information will be printed, even on dry runs
        - verbosity = -1 : The sync itself will be silent. Dry runs will only
                           report the sync statistics.
        - verbosity =  0 : Actual syncs will display a progress bar. Dry run
                           reports will summarize the changes to each shulker
                           box in addition to reporting the sync statistics.
        - verbosity =  1 : Actual syncs will report the progress of each file
                           transfer. Dry runs will report on each file to
                           be created, updated or deleted.
        - verbosity =  2 : Dry runs and syncs will print or log the output
                           of rsync run using the `-vv` modifier

        Verbosity values outside of this range will simply be capped / floored
        to [-2, 2].
    rsync_flags : str, optional
        By default, rsync will be run using the flags "shaz" which means:

          - no space splitting
          - make output (file sizes, mostly) human-readable
          - archive mode (see: https://www.baeldung.com/linux/rsync-archive-mode)
          - compress data during transfer

        Advanced users may choose to override these options, but **you do so
        at your own peril**.

    Raises
    ------
    TimeoutError
        If the rsync operation times out before completion
    FileNotFoundError
        If rsync's error output contains "No such file or directory"
    RuntimeError
        If rsync writes anything else to its error stream

    Notes
    -----
    This method does not perform any validation or normalization of the source,
    destination, exclude-list, additional arguments or rsync options.
    """
    rsync_flags = rsync_flags or "shaz"
    # enforce the documented range, so that e.g. verbosity=5 can't become "-vvvvv"
    verbosity = max(-2, min(2, verbosity))

    args: list[str] = [RSYNC, f"-{rsync_flags}"]  # type: ignore[list-item]
    if delete:
        args.append("--delete")
    if dry_run:
        args.extend(("--dry-run", "--stats"))
        if verbosity < 1:
            # at 1+ we don't need it to be machine-parseable
            args.append("--out-format=%i %n")
    else:
        if verbosity >= 0:
            args.append("--stats")
        if verbosity == 0:
            args.append("--info=progress2")
        if verbosity >= 1:
            args.append("--progress")
    if verbosity > 0:
        args.append("-" + "v" * verbosity)

    for pattern in exclude:
        args.extend(("--exclude", pattern))
    args.extend(additional_args)
    args.extend((source, destination_folder))

    SYNC_LOGGER.debug(
        "Executing the following command:\n %s",
        " ".join(args),
    )

    with subprocess.Popen(
        args,
        stdout=subprocess.PIPE if dry_run else None,
        stderr=subprocess.PIPE,
        cwd=working_directory,
    ) as proc:
        try:
            # communicate() drains both pipes while it waits. Waiting on (or
            # reading one pipe of) a process whose other pipe has filled up
            # would otherwise block until the timeout, or forever without one.
            raw_output, raw_errors = proc.communicate(timeout=timeout or None)
        except subprocess.TimeoutExpired as times_up:
            proc.kill()
            # collect whatever the process managed to write before it was killed
            raw_output, raw_errors = proc.communicate()
            if raw_output:
                SYNC_LOGGER.warning(raw_output.decode("UTF-8"))
            if raw_errors:
                SYNC_LOGGER.error(raw_errors.decode("UTF-8"))
            raise TimeoutError("Timeout reached.") from times_up

    # stdout is only captured (i.e. not None) on dry runs
    if raw_output:
        output_log = raw_output.decode("UTF-8")
        if verbosity > 0:
            dry_run_output = output_log.splitlines()
        else:
            dry_run_output = summarize_rsync_report(output_log)
            # NOTE(review): indentation was lost in the copy this was reviewed
            # from -- confirm the header belongs to the summarized branch only
            SYNC_LOGGER.info("\nSUMMARY\n-------")
        for line in dry_run_output:
            if _is_important_stats_line(line):
                SYNC_LOGGER.log(25, line)
            else:
                SYNC_LOGGER.debug(line)

    # anything at all on stderr is treated as a failure of the operation
    if raw_errors:
        error_log = raw_errors.decode("UTF-8")
        if "No such file or directory" in error_log:
            raise FileNotFoundError(error_log)
        raise RuntimeError(error_log)  # pragma: no cover

217 

218 

def summarize_rsync_report(raw_output: str, depth: int = 2) -> list[str]:
    """Take the captured output from running
    `rsync -ha --out-format="%i %n"`
    and report a high-level summary to the logging.INFO level

    Parameters
    ----------
    raw_output : str
        The raw output captured from running the rsync command
    depth : int, optional
        How many directories to go down from the root to generate the summary.
        Default is 2 (just report on top-level files and folders within the
        source folder).

    Returns
    -------
    list of str
        Any lines that weren't part of the rsync report (and were probably
        part of `--stats`?)

    Notes
    -----
    The rsync man page (https://linux.die.net/man/1/rsync) describes the output
    format as... "cryptic," which I find rather charitable. The relevant bits
    are that `--out-format="%i %n"` produces:
      - `%i` : a string of 11 characters that gives various metadata about the file
               transfer operation (is it a file, a directory or a link? Is it being
               sent or received? Created, updated or deleted?)
      - `%n`: the path of the file (or whatever), unquoted, un-escaped
    """
    summary: dict[str, dict[str, int] | str] = defaultdict(
        lambda: {"create": 0, "update": 0, "delete": 0}
    )
    stats: list[str] = []
    for line in raw_output.splitlines():
        # split only once, so that whitespace *inside* the path is preserved
        fields = line.split(maxsplit=1)
        if not fields:  # skip empty (and whitespace-only) lines
            continue

        info = fields[0]
        full_path = os.path.normpath(fields[1] if len(fields) > 1 else "")
        path_key = os.sep.join(full_path.split(os.sep)[:depth])

        if info.startswith("*deleting"):
            operation = "delete"
        elif info[2:5] == "+++":  # this is a creation
            operation = "create"
        elif info[:2] in ("<f", ">f", "cL"):
            # a file transfer or (as far as I can tell) a link being replaced.
            # Creates were caught above, so this must be an update
            operation = "update"
        elif info[:1] == ".":  # pragma: no cover
            # this just means permissions or dates are being updated or something
            SYNC_LOGGER.debug(line)
            continue
        else:  # then hopefully this is part of the stats report
            stats.append(line)
            continue

        if full_path == path_key:
            # the operation applies to the summarized entry itself
            summary[path_key] = operation
        elif operation == "create" and info[1] == "d":
            # don't count directories (and don't start an entry for them)
            pass
        else:
            entry = summary[path_key]
            if not isinstance(entry, str):
                entry[operation] += 1
            # otherwise this is already described by the top-level operation

        SYNC_LOGGER.debug(line)

    for path_key in sorted(summary):
        report = summary[path_key]
        if isinstance(report, str):
            # nice that these verbs follow the same pattern
            SYNC_LOGGER.info("%sing %s", report[:-1].title(), path_key)
        else:
            # the path is passed as a logging argument (not interpolated into
            # the format string) so that a "%" in a file name can't break it
            SYNC_LOGGER.info(
                "Within %s...\n%s",
                path_key,
                "\n".join(
                    f" - {op[:-1].title()}ing {count} file{'' if count == 1 else 's'}"
                    for op, count in report.items()
                ),
            )
    return stats

324 

325 

def _is_important_stats_line(line: str) -> bool:
    """Decide whether a stats line deserves to be logged at the INFO level
    rather than being relegated to the DEBUG log

    Parameters
    ----------
    line : str
        The log line to evaluate

    Returns
    -------
    bool
        True if and only if the line starts with one of the headline statistics
    """
    headline_prefixes = (
        "Number of created files:",
        "Number of deleted files:",
        "Number of regular files transferred:",
        "Total transferred file size:",
    )
    return any(line.startswith(prefix) for prefix in headline_prefixes)

348 

349 

350def pull( 

351 remote_uri: ParseResult, 

352 local_path: Path, 

353 exclude: Iterable[str], 

354 dry_run: bool, 

355 use_daemon: bool = False, 

356 timeout: int | None = None, 

357 delete: bool = True, 

358 verbosity: int = 0, 

359 rsync_args: Iterable[str] | None = None, 

360) -> None: 

361 """Sync an upstream file or folder into the specified location using rsync. 

362 This will overwrite any files and folders already at the destination. 

363 

364 Parameters 

365 ---------- 

366 remote_uri : ParseResult 

367 The URI for the remote resource to copy from 

368 local_path : Path 

369 The destination folder 

370 exclude : list of str 

371 Any patterns that should be excluded from the sync 

372 dry_run : bool 

373 Whether to only simulate this sync (report the operations to be performed 

374 but not actually perform them) 

375 use_daemon : bool, optional 

376 By default, the rsync is performed over ssh. If you happen to have an 

377 rsync daemon running on your system, however, you're welcome to leverage 

378 it instead by passing in `use_daemon=True` 

379 timeout : int, optional 

380 The number of seconds to wait before timing out the sync operation. 

381 If None is provided, no explicit timeout value will be set. 

382 delete : bool, optional 

383 Whether part of the syncing should include deleting files at the destination 

384 that aren't at the source. Default is True. 

385 verbosity : int 

386 A modifier for how much info to output either to stdout or the INFO-level 

387 logs. Defaults to 0. 

388 rsync_args: list of str, optional 

389 Any additional arguments to pass into rsync. Note that rsync is run by 

390 default with the flags: `-shaz` 

391 

392 Raises 

393 ------ 

394 FileNotFoundError 

395 If the destination folder does not exist 

396 

397 Notes 

398 ----- 

399 - This method does not provide for interactive authentication. If using 

400 rsync over SSH, you'll need to be set up for password-less (key-based) 

401 access. 

402 - If the destination folder does not already exist, this method will not 

403 create it or its parent directories. 

404 """ 

405 if not local_path.exists(): 

406 raise FileNotFoundError(f"{local_path} does not exist") 

407 

408 if remote_uri.netloc == get_default_netloc(): 

409 SYNC_LOGGER.debug("Performing sync as a local transfer") 

410 remote_path: str = unquote(remote_uri.path) 

411 elif use_daemon: 

412 remote_path = remote_uri.geturl() 

413 else: 

414 remote_path = uri_to_ssh(remote_uri) 

415 

416 if rsync_args: # pragma: no cover 

417 raise NotImplementedError 

418 

419 run_rsync( 

420 local_path.parent, 

421 remote_path, 

422 local_path.name, 

423 delete, 

424 dry_run, 

425 exclude, 

426 *(rsync_args or ()), 

427 timeout=timeout, 

428 verbosity=verbosity, 

429 ) 

430 

431 

def push(
    local_path: Path,
    remote_uri: ParseResult,
    exclude: Iterable[str],
    dry_run: bool,
    use_daemon: bool = False,
    timeout: int | None = None,
    delete: bool = True,
    verbosity: int = 0,
    rsync_args: Iterable[str] | None = None,
) -> None:
    """Use rsync to sync a local file or folder into the specified location,
    overwriting any files and folders already at the destination.

    Parameters
    ----------
    local_path : Path
        The file or folder to copy
    remote_uri : ParseResult
        The URI for the remote location to copy into
    exclude : list of str
        Any patterns that should be excluded from the sync
    dry_run : bool
        Whether to only simulate this sync (report the operations to be performed
        but not actually perform them)
    use_daemon : bool, optional
        By default, the rsync is performed over ssh. If you happen to have an
        rsync daemon running on your system, however, you're welcome to leverage
        it instead by passing in `use_daemon=True`
    timeout : int, optional
        The number of seconds to wait before timing out the sync operation.
        If None is provided, no explicit timeout value will be set.
    delete : bool, optional
        Whether part of the syncing should include deleting files at the destination
        that aren't at the source. Default is True.
    verbosity : int
        A modifier for how much info to output either to stdout or the INFO-level
        logs. Defaults to 0.
    rsync_args: list of str, optional
        Any additional arguments to pass into rsync. Note that rsync is run by
        default with the flags: `-shaz`. Passing a non-empty value currently
        raises NotImplementedError.

    Raises
    ------
    NotImplementedError
        If any rsync_args are provided

    Notes
    -----
    - This method does not provide for interactive authentication. If using
      rsync over SSH, you'll need to be set up for password-less (key-based)
      access.
    - If the destination folder does not already exist, this method will very
      likely fail.
    """
    # work out how rsync should address the remote side of the transfer
    if remote_uri.netloc == get_default_netloc():
        SYNC_LOGGER.debug("Performing sync as a local transfer")
        sync_target: str = unquote(remote_uri.path)
    else:
        sync_target = remote_uri.geturl() if use_daemon else uri_to_ssh(remote_uri)

    if rsync_args:  # pragma: no cover
        raise NotImplementedError

    extra_args = tuple(rsync_args or ())
    # rsync is run from the source's parent, addressing the source by name
    run_rsync(
        local_path.parent,
        local_path.name,
        sync_target,
        delete,
        dry_run,
        exclude,
        *extra_args,
        timeout=timeout,
        verbosity=verbosity,
    )