Coverage for src/accsr/config.py : 68%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
Contains helpers for defining and providing configuration classes. A typical usage would be to create the files
*config.py*, *config.json* and *config_local.json* in a project's root directory. An example of a config.py for a
data-driven project is below. For a non-data-driven project, the configuration class should inherit from
``accsr.config.ConfigurationBase``, and the resulting class will not have any pre-populated public entries.

>>> from accsr.config import DefaultDataConfiguration, ConfigProviderBase
>>>
>>> class __Configuration(DefaultDataConfiguration):
...     @property
...     def custom_entry_from_config(self):
...         return self._get_non_empty_entry("custom_entry_from_config")
...
...     @property
...     def existing_path_in_base_dir(self):
...         return self._get_existing_path(["base_dir", "path_in_base_dir"])
...
...     @property
...     def custom_path_in_processed_data(self):
...         return self.datafile_path("my_data", stage=self.PROCESSED, check_existence=False)
>>>
>>> class ConfigProvider(ConfigProviderBase[__Configuration]):
...     pass
>>>
>>> _config_provider = ConfigProvider()
>>>
>>>
>>> def get_config(reload=False):
...     return _config_provider.get_config(reload=reload)

31"""
33import inspect
34import json
35import logging.handlers
36import os
37from abc import ABC
38from copy import deepcopy
39from pathlib import Path
40from typing import Callable, Dict, Generic, List, TextIO, Type, TypeVar, Union, get_args
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
def recursive_dict_update(d: Dict, u: Dict):
    """
    Modifies d inplace by overwriting with non-dict values from u and updating all dict-values recursively.
    Returns the modified d.

    If u holds a dict under a key for which d holds a non-dict value (or no value at all), the value in d
    is replaced by a new dict into which the dict from u is merged.

    :param d: the dict to update; it is modified in place
    :param u: the dict whose entries are merged into d; it is not modified
    :return: d itself, after the update
    """
    # From https://stackoverflow.com/a/3233356/1069467
    for k, v in u.items():
        if isinstance(v, dict):
            existing = d.get(k)
            if not isinstance(existing, dict):
                # A non-dict value cannot be merged into, so it is overwritten by a fresh dict.
                existing = {}
            d[k] = recursive_dict_update(existing, v)
        else:
            d[k] = v
    return d
59def _replace_env_vars(conf: Union[dict], env_var_marker="env:"):
60 for k, v in conf.items():
61 if isinstance(v, str) and v.startswith(env_var_marker):
62 env_var_name = v.lstrip(env_var_marker)
63 conf[k] = os.getenv(env_var_name)
64 elif isinstance(v, dict):
65 _replace_env_vars(v, env_var_marker=env_var_marker)
68def _get_entry_with_replaced_env_vars(
69 entry: Union[str, float, list, dict], env_var_marker="env:"
70):
71 entry = deepcopy(entry)
72 if isinstance(entry, str) and entry.startswith(env_var_marker):
73 env_var_name = entry.lstrip(env_var_marker)
74 return os.getenv(env_var_name)
75 if isinstance(entry, dict):
76 _replace_env_vars(entry, env_var_marker=env_var_marker)
77 return entry
78 if isinstance(entry, list):
79 return [_get_entry_with_replaced_env_vars(v) for v in entry]
80 return entry
def get_config_reader(filename: str) -> Callable[[TextIO], Dict]:
    """
    Returns a reader for yaml or json files. The file type is determined by the file extension,
    which is matched case-insensitively (e.g. both *config.json* and *CONFIG.JSON* are accepted).

    :param filename: name or path of the configuration file; only its ending is inspected
    :return: a callable that takes an opened text file and returns the parsed content
    :raises ValueError: if the extension is none of .yaml, .yml or .json
    """
    normalized_name = filename.lower()
    if normalized_name.endswith((".yaml", ".yml")):
        # imported lazily such that yaml is only required when yaml files are actually used
        import yaml

        return yaml.safe_load
    if normalized_name.endswith(".json"):
        return json.load
    raise ValueError(
        f"Unsupported file type for {filename}. Supported are .yaml, .yml and .json."
    )
class ConfigurationBase(ABC):
    """
    Base class for reading and retrieving configuration entries. Do not instantiate this class directly but
    instead inherit from it.
    """

    # Prefix marking a string entry as a reference to an environment variable
    ENV_VAR_MARKER = "env:"

    def __init__(
        self,
        config_directory: str = None,
        config_files=("config.json", "config_local.json"),
    ):
        """
        :param config_directory: directory where to look for the config files. Typically, this will be a project's
            root directory. If None, the directory with the module containing the configuration class definition
            (inherited from ConfigurationBase) will be used.
        :param config_files: list of JSON or YAML configuration files (relative to config_directory) from which to read.
            The filenames should end in .json or .yaml/.yml.
            The configurations will be merged (dicts are merged, everything else is overwritten),
            entries more to the right have precedence.
            Non-existing files from the list will be ignored without errors or warnings. However, at least
            one file must exist for configuration to be read.
        :raises FileNotFoundError: if no configuration entries could be read from any of the files
        """
        self.config_directory = (
            config_directory
            if config_directory is not None
            else self._module_dir_path()
        )
        self.config = {}
        for filename in config_files:
            file_path = os.path.join(self.config_directory, filename)
            # the reader is retrieved before the existence check, so an unsupported extension
            # is reported even if the file does not exist
            file_reader = get_config_reader(filename)
            if os.path.exists(file_path):
                log.info(f"Reading configuration from {file_path}")
                with open(file_path, "r") as f:
                    read_config = file_reader(f)
                # A file without content may be parsed to None (e.g. an empty YAML document);
                # it contributes no entries instead of crashing the merge.
                if read_config is not None:
                    recursive_dict_update(self.config, read_config)
        if not self.config:
            raise FileNotFoundError(
                "No configuration entries could be read from "
                f"{[os.path.join(self.config_directory, c) for c in config_files]}"
            )

    def _module_dir_path(self):
        """
        :return: absolute path of the directory containing the module in which the class of this
            instance is defined
        """
        module_path = os.path.abspath(inspect.getfile(self.__class__))
        return os.path.dirname(module_path)

    def _get_non_empty_entry(
        self, key: Union[str, List[str]]
    ) -> Union[float, str, List, Dict]:
        """
        Retrieves an entry from the configuration

        :param key: key or list of keys to go through hierarchically
        :return: the queried json object, with references to environment variables
            (strings starting with ENV_VAR_MARKER) replaced by the variables' values
        :raises KeyError: if the entry (or any entry on the way to it) is missing or None
        """
        if isinstance(key, str):
            key = [key]
        value = self.config
        for k in key:
            # Descending through a non-dict value means that the entry is not set; this must result in
            # the KeyError below rather than in an AttributeError from calling .get on e.g. a string.
            value = value.get(k) if isinstance(value, dict) else None
            if value is None:
                raise KeyError(f"Value for key '{key}' not set in configuration")
        return _get_entry_with_replaced_env_vars(
            value, env_var_marker=self.ENV_VAR_MARKER
        )

    def _get_existing_path(self, key: Union[str, List[str]], create=True) -> str:
        """
        Retrieves an existing local path from the configuration

        :param key: key or list of keys to go through hierarchically
        :param create: if True, a directory with the given path will be created on the fly.
        :return: the queried path. Relative paths in the configuration are interpreted
            relative to config_directory.
        :raises FileNotFoundError: if the path does not exist and create is False
        """
        path_string = self._get_non_empty_entry(key)
        if os.path.isabs(path_string):
            path = path_string
        else:
            path = os.path.abspath(os.path.join(self.config_directory, path_string))
        if not os.path.exists(path):
            if isinstance(key, list):
                key = ".".join(key)  # purely for logging
            if create:
                log.info(
                    f"Configured directory {key}='{path}' not found; will create it"
                )
                # exist_ok guards against the directory being created concurrently after the check above
                os.makedirs(path, exist_ok=True)
            else:
                raise FileNotFoundError(
                    f"Configured directory {key}='{path}' does not exist."
                )
        return path.replace("/", os.sep)

    def _adjusted_path(self, path: str, relative: bool, check_existence: bool):
        """
        :param path: the path to adjust
        :param relative: If True, the returned path will be relative to config_directory
            (typically the project's top-level directory).
        :param check_existence: if True, will raise an error when file does not exist
        :return: the adjusted path, either absolute or relative
        :raises FileNotFoundError: if check_existence is True and the path does not exist
        """
        path = os.path.abspath(path)
        if check_existence and not os.path.exists(path):
            raise FileNotFoundError(f"No such file: {path}")
        if relative:
            return str(Path(path).relative_to(self.config_directory))
        return path
class DefaultDataConfiguration(ConfigurationBase, ABC):
    """
    Configuration with the default entries and retrieval methods of a typical data-driven project.
    The config.json of such a project would typically look like this:

    | {
    |   "data_raw": "data/raw",
    |   "data_cleaned": "data/cleaned",
    |   "data_processed": "data/processed",
    |   "data_ground_truth": "data/ground_truth",
    |   "visualizations": "data/visualizations",
    |   "artifacts": "data/artifacts",
    |   "temp": "temp",
    |   "data": "data"
    | }

    """

    PROCESSED = "processed"
    RAW = "raw"
    CLEANED = "cleaned"
    GROUND_TRUTH = "ground_truth"
    DATA = "data"

    # Each property below resolves the configuration entry of the same name via _get_existing_path.

    @property
    def artifacts(self):
        return self._get_existing_path("artifacts")

    @property
    def visualizations(self):
        return self._get_existing_path("visualizations")

    @property
    def temp(self):
        return self._get_existing_path("temp")

    @property
    def data(self):
        return self._get_existing_path("data")

    @property
    def data_raw(self):
        return self._get_existing_path("data_raw")

    @property
    def data_cleaned(self):
        return self._get_existing_path("data_cleaned")

    @property
    def data_processed(self):
        return self._get_existing_path("data_processed")

    @property
    def data_ground_truth(self):
        return self._get_existing_path("data_ground_truth")

    def datafile_path(
        self,
        filename: str,
        stage="raw",
        relative=False,
        check_existence=False,
    ):
        """
        Builds the path of a data file inside the directory of the given stage.

        :param filename: name of the file within the stage's directory
        :param stage: raw, ground_truth, cleaned or processed
        :param relative: If True, the returned path will be relative to the project's top-level directory
        :param check_existence: if True, will raise an error when file does not exist
        :return: the path as adjusted by _adjusted_path
        """
        return self._adjusted_path(
            os.path.join(self._data_basedir(stage), filename),
            relative,
            check_existence,
        )

    def _data_basedir(self, stage):
        """
        :param stage: one of RAW, CLEANED, PROCESSED or GROUND_TRUTH
        :return: the value of the property holding the directory of the given stage
        :raises KeyError: if the stage is none of the above
        """
        # Only the property of the matching stage is accessed, the others are not touched.
        stage_properties = (
            (self.RAW, "data_raw"),
            (self.CLEANED, "data_cleaned"),
            (self.PROCESSED, "data_processed"),
            (self.GROUND_TRUTH, "data_ground_truth"),
        )
        for known_stage, property_name in stage_properties:
            if stage == known_stage:
                return getattr(self, property_name)
        raise KeyError(f"Unknown stage: {stage}")

    def artifact_path(self, name: str, relative=False, check_existence=False):
        """
        Builds the path of an artifact inside the artifacts directory.

        :param name: name of the artifact within the artifacts directory
        :param relative: If True, the returned path will be relative to the project's top-level directory.
        :param check_existence: if True, will raise an error when file does not exist
        :return: the path as adjusted by _adjusted_path
        """
        return self._adjusted_path(
            os.path.join(self.artifacts, name), relative, check_existence
        )
# Type variable for the configuration class that a config provider is parametrized with;
# bound to ConfigurationBase.
ConfigurationClass = TypeVar("ConfigurationClass", bound=ConfigurationBase)
306class ConfigProviderBase(Generic[ConfigurationClass], ABC):
307 """
308 Class for providing a config-singleton. Should not be instantiated directly but instead subclassed with an
309 appropriate subclass of ConfigurationBase substituting the generic type.
311 Usage example:
312 >>> from accsr.config import ConfigurationBase, ConfigProviderBase
313 >>> class __MyConfigClass(ConfigurationBase):
314 ... pass
315 >>> class __MyConfigProvider(ConfigProviderBase[__MyConfigClass]):
316 ... pass
317 ...
318 >>> _config_provider = __MyConfigProvider()
319 ...
320 >>> def get_config():
321 ... return _config_provider.get_config()
322 """
324 def __init__(self):
325 self.__config_instance = None
326 self._config_args = None
327 self._config_kwargs = None
328 # retrieving the generic type at runtime, see
329 # https://stackoverflow.com/questions/48572831/how-to-access-the-type-arguments-of-typing-generic
330 self._config_constructor: Type[ConfigurationClass] = get_args(
331 self.__class__.__orig_bases__[0]
332 )[0]
334 def _should_update_config_instance(self, reload: bool, args, kwargs):
335 return (
336 self.__config_instance is None
337 or reload
338 or self._config_args != args
339 or self._config_kwargs != kwargs
340 )
342 def get_config(self, reload=False, *args, **kwargs) -> ConfigurationClass:
343 """
344 Retrieves the configuration object (as singleton).
346 :param reload: if True, the config will be reloaded from disk even if a suitable
347 configuration object already exists. This is mainly useful in interactive environments like notebooks.
348 :param args: passed to init of the configuration class
349 :param kwargs: passed to init of the configuration class constructor
350 :return:
351 """
352 if self._should_update_config_instance(reload, args, kwargs):
353 self._config_args = args
354 self._config_kwargs = kwargs
355 self.__config_instance = self._config_constructor(*args, **kwargs)
356 return self.__config_instance