Coverage for src/accsr/loading.py : 60%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import logging
2import os
3import re
4import tarfile
5import urllib.request
6from contextlib import contextmanager
7from io import BufferedReader
8from os import PathLike
9from typing import Dict, Optional, Union
11from tqdm import tqdm
13log = logging.getLogger(__name__)
def download_file(
    url: str,
    output_file: Union[str, PathLike],
    show_progress=False,
    overwrite_existing=False,
    headers: Optional[Dict[str, str]] = None,
):
    """
    Download a file via HTTP[S] to a specified path.

    The response body is streamed to disk in fixed-size chunks. Custom headers are sent through
    an opener that is private to this call, so they are never attached to unrelated requests made
    elsewhere in the process.

    :param url: URL of the file to be downloaded
    :param output_file: Destination path for the downloaded file. Missing parent directories are created.
    :param show_progress: show a progress bar using :mod:`tqdm`
    :param overwrite_existing: whether to overwrite existing files
    :param headers: Optional headers to add to request, e.g. {"Authorization": "Bearer <access_token>" }
    :raises FileExistsError: if `output_file` already exists and `overwrite_existing` is False
    :raises urllib.error.ContentTooShortError: if the response announced a Content-Length but fewer
        bytes were received
    """
    from urllib.error import ContentTooShortError

    if os.path.exists(output_file):
        if overwrite_existing:
            log.info(f"Overwriting existing file {output_file}")
        else:
            raise FileExistsError(f"{output_file} exists, skipping download")

    # A bare file name has an empty dirname, and os.makedirs("") raises FileNotFoundError.
    parent_dir = os.path.dirname(output_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    if headers:
        # Use a call-local opener instead of install_opener(): installing it globally would leak
        # the headers (possibly credentials) into every later urllib request of the process.
        opener = urllib.request.build_opener()
        opener.addheaders = list(headers.items())
        open_url = opener.open
    else:
        open_url = urllib.request.urlopen

    chunk_size = 64 * 1024
    received = 0
    with open_url(url) as response:
        response_headers = response.info()
        announced_size = response_headers.get("Content-Length")
        # The size is unknown (None) when the server does not send a Content-Length header.
        total_size = int(announced_size) if announced_size is not None else None

        progress = (
            tqdm(desc=str(output_file), total=total_size, unit="B", unit_scale=True)
            if show_progress
            else None
        )
        try:
            with open(output_file, "wb") as target:
                for chunk in iter(lambda: response.read(chunk_size), b""):
                    target.write(chunk)
                    received += len(chunk)
                    if progress is not None:
                        # Advance by the bytes actually written so the bar never overshoots.
                        progress.update(len(chunk))
        finally:
            if progress is not None:
                progress.close()

    # Mirror urllib.request.urlretrieve: a truncated transfer is reported as an error.
    if total_size is not None and received < total_size:
        raise ContentTooShortError(
            "retrieval incomplete: got only %i out of %i bytes" % (received, total_size),
            (output_file, response_headers),
        )
@contextmanager
def open_file_in_tar(
    path: Union[str, PathLike], file_regex: Union[str, re.Pattern] = ".*"
) -> BufferedReader:
    """
    Opens an archived file in memory without extracting it on disc. Use as context manager:

    >>> with open_file_in_tar(...) as fh: pass

    :param path: Local file path to the tar archive.
    :param file_regex: A regular expression which will be matched (anchored at the start of the name,
        as with :func:`re.match`) against the files in the archive. Directory entries are ignored.
        The matching file will be returned.

    :raises `ValueError`: when the `file_regex` matches multiple or no file in the archive, or when
        the single matching member cannot be opened as a file.
    """
    if isinstance(file_regex, str):
        file_regex = re.compile(file_regex)

    with tarfile.open(path) as tar:
        # Directory entries can never be opened as a file; skipping them lets a broad pattern
        # (such as the default ".*") select the single file of an archive that also stores
        # its parent directory as a member.
        matched_members = [
            member
            for member in tar.getmembers()
            if not member.isdir() and file_regex.match(member.name)
        ]
        matches = [member.name for member in matched_members]
        if len(matches) != 1:
            raise ValueError(
                f"Regular expression {file_regex.pattern} matched against zero or multiple files {matches}"
            )
        file_name = matches[0]
        log.debug(f"Yielding {file_name} from {path}")
        # extractfile returns None for members that are neither regular files nor links.
        file = tar.extractfile(matched_members[0])
        if file is None:
            raise ValueError(
                f"Member {file_name} of {path} is not a regular file or link and cannot be opened"
            )
        with file:
            yield file