Source code for eradiate.data._core

from __future__ import annotations

import os
import typing as t
import warnings
from abc import ABC, abstractmethod
from pathlib import Path

import attrs
import pooch
import tqdm
from ruamel.yaml import YAML

from ..typing import PathLike


[docs] @attrs.define class DataStore(ABC): """ Interface class for all data stores. """ @property @abstractmethod def base_url(self) -> str: """ str : Address of the remote storage location. """ pass @property @abstractmethod def registry(self) -> dict: """ dict : Registry contents. """ pass
[docs] @abstractmethod def registry_files( self, filter: t.Callable[[t.Any], bool] | None = None ) -> list[str]: """ Get a list of registered files. Parameters ---------- filter : callable, optional A filter function taking a file path as a single string argument and returning a boolean. Filenames for which the filter returns ``True`` will be returned. Returns ------- files : list of str List of registered files. """ pass
[docs] @abstractmethod def fetch( self, filename: PathLike, **kwargs, ) -> Path: """ Fetch a file from the data store. Parameters ---------- filename : path-like File name to fetch from the data store, relative to the storage root. Returns ------- Path Absolute path where the retrieved resource is located. Raises ------ DataError The requested file could not be served. """ pass
def registry_from_file(filename: PathLike, warn: bool = True) -> dict: """ Read registry content from a file. Parameters ---------- filename : path-like Path to the file to be read. warn : bool, optional If ``True``, ill-formed lines will result in a warning. Otherwise, an exception will be raised. Returns ------- registry : dict Registry contents, parsed as a dictionary. Raises ------ ValueError If `warn` is ``False`` and an error occurs while parsing the file. """ result = {} with open(filename, "r") as f: for i, line in enumerate(f): line = line.strip() # Skip empty lines if not line: continue # Skip comment lines if line.startswith("#"): continue items = line.split(maxsplit=1) # No metadata (just the filename) if len(items) == 1: result[items[0]] = "" continue # Found metadata if len(items) == 2: result[items[0]] = items[1] continue # Ill-formed line if len(items) > 2: if warn: warnings.warn( f"While parsing registry file {filename}: skipping " f"ill-formed line {i}" ) continue else: raise ValueError( f"While parsing registry file {filename}: " f"ill-formed line {i}" ) return result def registry_to_file(registry: dict, filename: PathLike) -> None: """ Write a registry dictionary to a text file. Parameters ---------- registry : dict A registry dictionary. filename : path-like Path to the file where registry contents are to be written. """ lines = [f"{path} {registry[path]}".strip() for path in sorted(registry)] lines.append("") content = "\n".join(lines) with open(filename, "w") as f: f.write(content) def load_rules(filename: PathLike) -> dict: """ Load include and exclude rules from a YAML file. Parameters ---------- filename : path-like Path to the YAML file from which rules are to be loaded. Returns ------- rules : dict Dictionary containing a list of inclusion (resp. exclusion) rules under the ``"include"`` (resp. ``"exclude"``) key. """ yaml = YAML() try: with open(filename) as f: rules = yaml.load(f) except OSError: rules = {"include": ["**/*.*"], "exclude": []} return rules def expand_rules( rules: list[str], prefix: PathLike = ".", as_list: bool = False, include_dirs=False, ) -> list | set: """ Expand a list of filesystem selection rules to paths. Parameters ---------- rules : list of str List of inclusion to expand, relative to `prefix`. Each rule may be a path to a file or a shell glob. prefix : path-like, optional Path where to expand the rules. By default, the current working directory (``"."``) is used. as_list : bool, optional If ``True``, return the result as a sorted list; otherwise, return it as a set. include_dirs : bool, optional If ``True``, include directories in the list of expanded items. Otherwise, the expansion is restricted to files. Returns ------- items : set or list Items corresponding to the expanded rules. """ expanded = set() for rule in rules: expanded |= set( x for x in Path(prefix).rglob(rule) if (x.is_file() or include_dirs) ) return sorted(expanded) if as_list else expanded def list_files( path: PathLike, includes: list[str] | None = None, excludes: list[str] | None = None, as_list: bool = False, ) -> list | set: """ List files in a directory based on inclusion and exclusion rules. Parameters ---------- path : path-like Path to the target directory. includes : list of str, optional List of inclusion rules, relative to `path`. Each rule may be a path to a file or a shell glob. If no rule is passed, everything is included (*i.e.* ``["**/*"]`` is used). excludes : list of str, optional List of exclusion rules, relative to `path`. Each rule may be a path to a file or a shell glob. If no rule is passed, nothing is excluded (*i.e.* ``[]`` is used). as_list : bool If ``True``, return the result as a sorted list; otherwise, return it as a set. Returns ------- items : set or list Files listed in the target directory, according to the rules. """ if includes is None: includes = ["**/*"] if excludes is None: excludes = [] included: set = expand_rules(includes, prefix=path) excluded: set = expand_rules(excludes, prefix=path) return sorted(included - excluded) if as_list else included - excluded def make_registry( path: PathLike, filename: PathLike = "registry.txt", includes: list | None = None, excludes: list | None = None, alg="sha256", show_progress=False, ) -> None: """ Create a registry file from items in a directory, possibly applying inclusion and exclusion rules. Parameters ---------- path : path-like Path to the target directory. filename : path-like Path to the file where registry contents are to be written. includes : list of str, optional List of inclusion rules, relative to `path`. Each rule may be a path to a file or a shell glob. If no rule is passed, everything is included (*i.e.* ``["**/*"]`` is used). excludes : list of str, optional List of exclusion rules, relative to `path`. Each rule may be a path to a file or a shell glob. If no rule is passed, nothing is excluded (*i.e.* ``[]`` is used). alg : str, optional Hash algorithm used. """ # Basically pooch.make_registry() with inclusion / exclusion rules files = list_files(path, includes, excludes) hashes = ( [pooch.file_hash(x, alg=alg) for x in files] if alg is not None else ["" for _ in files] ) registry = {} with tqdm.tqdm(total=len(hashes), disable=not show_progress) as pbar: for file, hash in zip(files, hashes): registry[os.path.relpath(file, path)] = f"{alg}:{hash}" pbar.update() registry_to_file(registry, filename)