Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1""" 

2Contains helpers for defining and providing configuration classes. A typical usage would be to create the files 

3*config.py*, *config.json* and *config_local.json* in a project's root directory. An example of a config.py for a 

4data-driven project is below. For a non-data driven project, the configuration class should inherit from 

5``accsr.config.ConfigurationBase``, and the resulting class will not have any pre-populated public entries. 

6 

7>>> from accsr.config import DefaultDataConfiguration, ConfigProviderBase 

8>>> 

9>>> class __Configuration(DefaultDataConfiguration): 

10... @property 

11... def custom_entry_from_config(self): 

12... return self._get_non_empty_entry("custom_entry_from_config") 

13... 

14... @property 

15... def existing_path_in_base_dir(self): 

16... return self._get_existing_path(["base_dir", "path_in_base_dir"]) 

17... 

18... @property 

19... def custom_path_in_processed_data(self): 

20... return self.datafile_path("my_data", stage=self.PROCESSED, check_existence=False) 

21>>> 

22>>> class ConfigProvider(ConfigProviderBase[__Configuration]): 

23... pass 

24>>> 

25>>> _config_provider = ConfigProvider() 

26>>> 

27>>> 

28>>> def get_config(reload=False): 

29... return _config_provider.get_config(reload=reload) 

30 

31""" 

32 

33import inspect 

34import json 

35import logging.handlers 

36import os 

37from abc import ABC 

38from copy import deepcopy 

39from pathlib import Path 

40from typing import Callable, Dict, Generic, List, TextIO, Type, TypeVar, Union, get_args 

41 

42log = logging.getLogger(__name__) 

43 

44 

def recursive_dict_update(d: Dict, u: Dict):
    """
    Modifies d inplace by overwriting with non-dict values from u and updating all dict-values recursively.
    Returns the modified d.

    If u holds a dict under a key for which d holds a non-dict value (or no value at all), the value in d
    is replaced by a new dict that is filled from u.

    :param d: the dictionary to update in place
    :param u: the dictionary with the values that take precedence
    :return: the modified d (the same object that was passed in)
    """
    # From https://stackoverflow.com/a/3233356/1069467
    for k, v in u.items():
        if isinstance(v, dict):
            existing = d.get(k)
            if not isinstance(existing, dict):
                # A scalar/list (or missing) entry cannot be merged into, so it is overwritten by a fresh dict.
                existing = {}
            d[k] = recursive_dict_update(existing, v)
        else:
            d[k] = v
    return d

57 

58 

59def _replace_env_vars(conf: Union[dict], env_var_marker="env:"): 

60 for k, v in conf.items(): 

61 if isinstance(v, str) and v.startswith(env_var_marker): 

62 env_var_name = v.lstrip(env_var_marker) 

63 conf[k] = os.getenv(env_var_name) 

64 elif isinstance(v, dict): 

65 _replace_env_vars(v, env_var_marker=env_var_marker) 

66 

67 

68def _get_entry_with_replaced_env_vars( 

69 entry: Union[str, float, list, dict], env_var_marker="env:" 

70): 

71 entry = deepcopy(entry) 

72 if isinstance(entry, str) and entry.startswith(env_var_marker): 

73 env_var_name = entry.lstrip(env_var_marker) 

74 return os.getenv(env_var_name) 

75 if isinstance(entry, dict): 

76 _replace_env_vars(entry, env_var_marker=env_var_marker) 

77 return entry 

78 if isinstance(entry, list): 

79 return [_get_entry_with_replaced_env_vars(v) for v in entry] 

80 return entry 

81 

82 

def get_config_reader(filename: str) -> Callable[[TextIO], Dict]:
    """
    Selects the function for parsing a configuration file based on the (case-sensitive) extension of filename.

    :param filename: name of the configuration file; only its ending is inspected
    :return: ``yaml.safe_load`` for .yaml/.yml files and ``json.load`` for .json files
    :raises ValueError: if the filename has none of the supported extensions
    """
    if filename.endswith((".yaml", ".yml")):
        # imported lazily such that yaml is only needed when yaml files are actually used
        import yaml

        return yaml.safe_load
    if filename.endswith(".json"):
        return json.load
    raise ValueError(
        f"Unsupported file type for {filename}. Supported are .yaml, .yml and .json."
    )

96 

97 

class ConfigurationBase(ABC):
    """
    Base class for reading and retrieving configuration entries. Do not instantiate this class directly but
    instead inherit from it.

    The merged content of all configuration files is held in ``self.config``. String values that start with
    ``ENV_VAR_MARKER`` are resolved to the value of the corresponding environment variable on retrieval.
    """

    ENV_VAR_MARKER = "env:"

    def __init__(
        self,
        config_directory: str = None,
        config_files=("config.json", "config_local.json"),
    ):
        """
        :param config_directory: directory where to look for the config files. Typically, this will be a project's
            root directory. If None, the directory with the module containing the configuration class definition
            (inherited from ConfigurationBase) will be used.
        :param config_files: list of JSON or YAML configuration files (relative to config_directory) from which to read.
            The filenames should end in .json or .yaml/.yml.
            The configurations will be merged (dicts are merged, everything else is overwritten),
            entries more to the right have precedence.
            Non-existing files from the list will be ignored without errors or warnings. However, at least
            one file must exist for configuration to be read.
        :raises ValueError: if one of the filenames has an unsupported extension (even if the file does not exist)
        :raises FileNotFoundError: if no configuration entries could be read from any of the files
        """
        self.config_directory = (
            config_directory
            if config_directory is not None
            else self._module_dir_path()
        )
        self.config = {}
        for filename in config_files:
            file_path = os.path.join(self.config_directory, filename)
            # The reader is resolved before the existence check such that unsupported extensions always raise.
            file_reader = get_config_reader(filename)
            if not os.path.exists(file_path):
                continue
            log.info(f"Reading configuration from {file_path}")
            with open(file_path, "r") as f:
                read_config = file_reader(f)
            if read_config is None:
                # A reader may return None (PyYAML does so for an empty document); there is nothing to merge.
                continue
            recursive_dict_update(self.config, read_config)
        if not self.config:
            raise FileNotFoundError(
                "No configuration entries could be read from "
                f"{[os.path.join(self.config_directory, c) for c in config_files]}"
            )

    def _module_dir_path(self):
        """
        :return: absolute path of the directory containing the module in which the class of self is defined
        """
        module_path = os.path.abspath(inspect.getfile(self.__class__))
        return os.path.dirname(module_path)

    def _get_non_empty_entry(
        self, key: Union[str, List[str]]
    ) -> Union[float, str, List, Dict]:
        """
        Retrieves an entry from the configuration

        :param key: key or list of keys to go through hierarchically
        :return: the queried json object (a copy, with references to environment variables resolved)
        :raises KeyError: if the entry, or one of the levels on the way to it, is missing, null or
            (for intermediate levels) not a dictionary
        """
        if isinstance(key, str):
            key = [key]
        value = self.config
        for k in key:
            # A missing or non-dict intermediate level is reported as KeyError, just like a missing leaf.
            value = value.get(k) if isinstance(value, dict) else None
            if value is None:
                raise KeyError(f"Value for key '{key}' not set in configuration")
        return _get_entry_with_replaced_env_vars(
            value, env_var_marker=self.ENV_VAR_MARKER
        )

    def _get_existing_path(self, key: Union[str, List[str]], create=True) -> str:
        """
        Retrieves an existing local path from the configuration

        :param key: key or list of keys to go through hierarchically
        :param create: if True, a directory with the given path will be created on the fly.
        :return: the queried path. Relative paths from the configuration are resolved against
            the config_directory
        :raises FileNotFoundError: if the path does not exist and create is False
        """
        path_string = self._get_non_empty_entry(key)
        if os.path.isabs(path_string):
            path = path_string
        else:
            path = os.path.abspath(os.path.join(self.config_directory, path_string))
        if not os.path.exists(path):
            if isinstance(key, list):
                key = ".".join(key)  # purely for logging
            if create:
                log.info(
                    f"Configured directory {key}='{path}' not found; will create it"
                )
                # exist_ok guards against the directory being created concurrently after the check above
                os.makedirs(path, exist_ok=True)
            else:
                raise FileNotFoundError(
                    f"Configured directory {key}='{path}' does not exist."
                )
        return path.replace("/", os.sep)

    def _adjusted_path(self, path: str, relative: bool, check_existence: bool):
        """
        :param path: the path to adjust; relative paths are resolved against the current working directory
        :param relative: If true, the returned path will be relative the project's top-level directory.
        :param check_existence: if True, will raise an error when file does not exist
        :return: the adjusted path, either absolute or relative
        :raises FileNotFoundError: if check_existence is True and the path does not exist
        :raises ValueError: if relative is True and the path is not located inside the config_directory
        """
        path = os.path.abspath(path)
        if check_existence and not os.path.exists(path):
            raise FileNotFoundError(f"No such file: {path}")
        if relative:
            # path is absolute at this point, so the reference directory has to be absolute as well
            return str(Path(path).relative_to(os.path.abspath(self.config_directory)))
        return path

204 

205 

class DefaultDataConfiguration(ConfigurationBase, ABC):
    """
    Configuration with the default entries and path helpers of a typical data-driven project.
    All directory properties resolve the corresponding configuration entry to a path and create
    the directory if it does not exist yet. A typical config.json file would look like this:

    | {
    |    "data_raw": "data/raw",
    |    "data_cleaned": "data/cleaned",
    |    "data_processed": "data/processed",
    |    "data_ground_truth": "data/ground_truth",
    |    "visualizations": "data/visualizations",
    |    "artifacts": "data/artifacts",
    |    "temp": "temp",
    |    "data": "data"
    | }

    """

    PROCESSED = "processed"
    RAW = "raw"
    CLEANED = "cleaned"
    GROUND_TRUTH = "ground_truth"
    DATA = "data"

    @property
    def artifacts(self):
        """Path of the directory configured under "artifacts"."""
        return self._get_existing_path("artifacts")

    @property
    def visualizations(self):
        """Path of the directory configured under "visualizations"."""
        return self._get_existing_path("visualizations")

    @property
    def temp(self):
        """Path of the directory configured under "temp"."""
        return self._get_existing_path("temp")

    @property
    def data(self):
        """Path of the directory configured under "data"."""
        return self._get_existing_path("data")

    @property
    def data_raw(self):
        """Path of the directory configured under "data_raw"."""
        return self._get_existing_path("data_raw")

    @property
    def data_cleaned(self):
        """Path of the directory configured under "data_cleaned"."""
        return self._get_existing_path("data_cleaned")

    @property
    def data_processed(self):
        """Path of the directory configured under "data_processed"."""
        return self._get_existing_path("data_processed")

    @property
    def data_ground_truth(self):
        """Path of the directory configured under "data_ground_truth"."""
        return self._get_existing_path("data_ground_truth")

    def datafile_path(
        self,
        filename: str,
        stage="raw",
        relative=False,
        check_existence=False,
    ):
        """
        Builds the path of a data file inside the directory belonging to the given stage.

        :param filename: name of the file, relative to the stage's directory
        :param stage: raw, ground_truth, cleaned or processed
        :param relative: If True, the returned path will be relative the project's top-level directory
        :param check_existence: if True, will raise an error when file does not exist
        :return: the path of the data file
        """
        return self._adjusted_path(
            os.path.join(self._data_basedir(stage), filename),
            relative,
            check_existence,
        )

    def _data_basedir(self, stage):
        """
        :param stage: one of RAW, CLEANED, PROCESSED or GROUND_TRUTH
        :return: the directory holding the data of the given stage
        :raises KeyError: for any other stage
        """
        # Property names instead of values: only the directory of the requested stage may be accessed,
        # since accessing a directory property can create the directory.
        stage_properties = (
            (self.RAW, "data_raw"),
            (self.CLEANED, "data_cleaned"),
            (self.PROCESSED, "data_processed"),
            (self.GROUND_TRUTH, "data_ground_truth"),
        )
        for known_stage, property_name in stage_properties:
            if stage == known_stage:
                return getattr(self, property_name)
        raise KeyError(f"Unknown stage: {stage}")

    def artifact_path(self, name: str, relative=False, check_existence=False):
        """
        Builds the path of an artifact inside the artifacts directory.

        :param name: name of the artifact, relative to the artifacts directory
        :param relative: If true, the returned path will be relative the project's top-level directory.
        :param check_existence: if True, will raise an error when file does not exist
        :return: the path of the artifact
        """
        artifact_location = os.path.join(self.artifacts, name)
        return self._adjusted_path(artifact_location, relative, check_existence)

301 

302 

ConfigurationClass = TypeVar("ConfigurationClass", bound=ConfigurationBase)


class ConfigProviderBase(Generic[ConfigurationClass], ABC):
    """
    Provides a configuration object as singleton. This class is meant to be subclassed, with the generic type
    substituted by the desired subclass of ConfigurationBase; it should not be instantiated directly.

    Usage example:
        >>> from accsr.config import ConfigurationBase, ConfigProviderBase
        >>> class __MyConfigClass(ConfigurationBase):
        ...     pass
        >>> class __MyConfigProvider(ConfigProviderBase[__MyConfigClass]):
        ...     pass
        ...
        >>> _config_provider = __MyConfigProvider()
        ...
        >>> def get_config():
        ...     return _config_provider.get_config()
    """

    def __init__(self):
        self.__config_instance = None
        self._config_args = None
        self._config_kwargs = None
        # retrieving the generic type at runtime, see
        # https://stackoverflow.com/questions/48572831/how-to-access-the-type-arguments-of-typing-generic
        parametrized_base = self.__class__.__orig_bases__[0]
        type_arguments = get_args(parametrized_base)
        self._config_constructor: Type[ConfigurationClass] = type_arguments[0]

    def _should_update_config_instance(self, reload: bool, args, kwargs):
        """
        :param reload: whether a reload was explicitly requested
        :param args: positional arguments requested for the configuration class
        :param kwargs: keyword arguments requested for the configuration class
        :return: a truthy value if no configuration object exists yet, a reload was requested or the
            arguments differ from the ones the existing configuration object was created with
        """
        if self.__config_instance is None:
            return True
        return reload or self._config_args != args or self._config_kwargs != kwargs

    def get_config(self, reload=False, *args, **kwargs) -> ConfigurationClass:
        """
        Retrieves the configuration object (as singleton). A new object is only created on the first call,
        on explicit reload, or when args/kwargs differ from the ones used for the existing object.

        :param reload: if True, the config will be reloaded from disk even if a suitable
            configuration object already exists. This is mainly useful in interactive environments like notebooks.
        :param args: passed to init of the configuration class
        :param kwargs: passed to init of the configuration class constructor
        :return: the configuration object
        """
        if not self._should_update_config_instance(reload, args, kwargs):
            return self.__config_instance
        # the arguments are remembered before construction, then the singleton is replaced
        self._config_args = args
        self._config_kwargs = kwargs
        self.__config_instance = self._config_constructor(*args, **kwargs)
        return self.__config_instance