Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1import logging 

2import os 

3import re 

4import tarfile 

5import urllib.request 

6from contextlib import contextmanager 

7from io import BufferedReader 

8from os import PathLike 

9from typing import Dict, Optional, Union 

10 

11from tqdm import tqdm 

12 

13log = logging.getLogger(__name__) 

14 

15 

def download_file(
    url: str,
    output_file: Union[str, PathLike],
    show_progress=False,
    overwrite_existing=False,
    headers: Optional[Dict[str, str]] = None,
):
    """
    Download a file via HTTP[S] to a specified path.

    The response body is streamed to ``output_file`` in fixed-size chunks. Custom
    headers are attached to this single request only; the process-wide urllib
    opener is left untouched.

    :param url: URL of the file to be downloaded
    :param output_file: Destination path for the downloaded file. Missing parent
        directories are created; a bare file name is written to the current
        working directory.
    :param show_progress: show a progress bar using :mod:`tqdm`
    :param overwrite_existing: whether to overwrite existing files
    :param headers: Optional headers to add to request, e.g. {"Authorization": "Bearer <access_token>" }

    :raises FileExistsError: when ``output_file`` exists and ``overwrite_existing`` is false.
    :raises urllib.error.ContentTooShortError: when fewer bytes were received than
        announced by the ``Content-Length`` header.
    """
    # Local import: only this function needs the exception type.
    from urllib.error import ContentTooShortError

    if os.path.exists(output_file):
        if not overwrite_existing:
            raise FileExistsError(f"{output_file} exists, skipping download")
        log.info(f"Overwriting existing file {output_file}")

    # A bare file name has an empty dirname, which os.makedirs() rejects.
    parent_dir = os.path.dirname(output_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Attach the headers to this request only. Installing a global opener would
    # leak them (e.g. bearer tokens) into every later urllib call in the process.
    request = urllib.request.Request(url, headers=dict(headers) if headers else {})
    block_size = 8 * 1024

    with urllib.request.urlopen(request) as response:
        response_headers = response.info()
        content_length = response_headers.get("Content-Length")
        # The total size is unknown when the server sends no Content-Length.
        expected_size = int(content_length) if content_length else None

        progress = None
        if show_progress:
            # tqdm needs a string description; output_file may be a PathLike.
            progress = tqdm(
                desc=str(output_file), total=expected_size, unit="B", unit_scale=True
            )

        received = 0
        try:
            with open(output_file, "wb") as target:
                for chunk in iter(lambda: response.read(block_size), b""):
                    target.write(chunk)
                    received += len(chunk)
                    if progress is not None:
                        # Advance by the bytes actually read so the bar never overshoots.
                        progress.update(len(chunk))
        finally:
            if progress is not None:
                progress.close()

    # Same truncation check that urllib.request.urlretrieve performs.
    if expected_size is not None and received < expected_size:
        raise ContentTooShortError(
            f"retrieval incomplete: got only {received} out of {expected_size} bytes",
            (output_file, response_headers),
        )

54 

55 

@contextmanager
def open_file_in_tar(
    path: Union[str, PathLike], file_regex: Union[str, re.Pattern] = ".*"
) -> BufferedReader:
    """
    Opens an archived file in memory without extracting it on disc. Use as context manager:
    >>> with open_file_in_tar(...) as fh: pass

    The yielded file object is only valid inside the ``with`` block; both it and
    the archive are closed on exit.

    :param path: Local file path to the tar archive.
    :param file_regex: A regular expression which will be matched (with ``re.match``,
        i.e. anchored at the start) against the member names in the archive.
        The matching file will be returned.

    :raises `ValueError`: when the `file_regex` matches multiple or no file in the archive,
        or when the single match is not a regular file (e.g. a directory).
    """
    if isinstance(file_regex, str):
        file_regex = re.compile(file_regex)

    with tarfile.open(path) as tar:
        matches = [name for name in tar.getnames() if file_regex.match(name)]
        if len(matches) != 1:
            raise ValueError(
                f"Regular expression {file_regex.pattern} matched against zero or multiple files {matches}"
            )
        file_name = matches[0]

        # extractfile() returns None for members that are neither regular files
        # nor links (directories, devices, ...); fail with a clear error instead
        # of crashing on `with None`.
        member_file = tar.extractfile(file_name)
        if member_file is None:
            raise ValueError(
                f"{file_name} in {path} is not a regular file and cannot be opened"
            )

        log.debug(f"Yielding {file_name} from {path}")
        with member_file as file:
            yield file

83 yield file