Source code for pytest_wdl.core

#    Copyright 2019 Eli Lilly and Company
#
#    Licensed under the Apache License, Version 2.0 (the "License");
#    you may not use this file except in compliance with the License.
#    You may obtain a copy of the License at
#
#        http://www.apache.org/licenses/LICENSE-2.0
#
#    Unless required by applicable law or agreed to in writing, software
#    distributed under the License is distributed on an "AS IS" BASIS,
#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#    See the License for the specific language governing permissions and
#    limitations under the License.

from abc import ABCMeta, abstractmethod
import hashlib
import json
import os
from pathlib import Path
import re
import shutil
import tempfile
from typing import Callable, Dict, List, Optional, Pattern, Type, Union, cast

import delegator

from pytest_wdl.utils import (
    LOG, tempdir, ensure_path, plugin_factory_map, env_map,
    resolve_value_descriptor, download_file
)


ENV_CACHE_DIR = "PYTEST_WDL_CACHE_DIR"
KEY_CACHE_DIR = "cache_dir"
ENV_EXECUTION_DIR = "PYTEST_WDL_EXECUTION_DIR"
KEY_EXECUTION_DIR = "execution_dir"
KEY_PROXIES = "proxies"
KEY_HTTP_HEADERS = "http_headers"
KEY_SHOW_PROGRESS = "show_progress"
KEY_EXECUTORS = "executors"


class UserConfiguration:
    """
    Stores pytest-wdl configuration. If configuration options are specified both
    in the config file and as arguments to the constructor, the latter take
    precedence.

    Args:
        config_file: JSON file from which to load default values.
        cache_dir: The directory in which to cache localized files; defaults to
            using a temporary directory that is specific to each module and
            deleted afterwards.
        remove_cache_dir: Whether to remove the cache directory; if None, takes
            the value True if a temp directory is used for caching, and False
            if a value for `cache_dir` is specified.
        execution_dir: The directory in which to run workflows. Defaults to
            None, which signals that a different temporary directory should be
            used for each workflow run.
        proxies: Mapping of proxy type (typically 'http' or 'https') to either
            an environment variable, or a dict with either/both keys 'env' and
            'value', where the value is taken from the environment variable
            ('env') first, and from 'value' if the environment variable is not
            specified or is unset.
        http_headers: A list of dicts, each of which defines a header. The
            allowed keys are 'pattern', 'name', 'env', and 'value', where
            'pattern' is a URL pattern to match, 'name' is the header name, and
            'env' and 'value' are interpreted the same as for `proxies`. If no
            pattern is provided, the header is used for all URLs.
        show_progress: Whether to show progress bars when downloading remote
            test data files.
        executor_defaults: Mapping of executor name to dict of executor-specific
            configuration options.
    """
    def __init__(
        self,
        config_file: Optional[Path] = None,
        cache_dir: Optional[Path] = None,
        remove_cache_dir: Optional[bool] = None,
        execution_dir: Optional[Path] = None,
        proxies: Optional[Dict[str, Union[str, Dict[str, str]]]] = None,
        http_headers: Optional[List[dict]] = None,
        show_progress: Optional[bool] = None,
        executor_defaults: Optional[Dict[str, dict]] = None,
    ):
        if config_file:
            with open(config_file, "rt") as inp:
                defaults = json.load(inp)
        else:
            defaults = {}

        if not cache_dir:
            cache_dir_str = os.environ.get(ENV_CACHE_DIR, defaults.get(KEY_CACHE_DIR))
            if cache_dir_str:
                cache_dir = ensure_path(cache_dir_str)
        if cache_dir:
            self.cache_dir = ensure_path(cache_dir, is_file=False, create=True)
            if remove_cache_dir is None:
                remove_cache_dir = False
        else:
            self.cache_dir = Path(tempfile.mkdtemp())
            if remove_cache_dir is None:
                remove_cache_dir = True
        self.remove_cache_dir = remove_cache_dir

        if not execution_dir:
            execution_dir_str = os.environ.get(
                ENV_EXECUTION_DIR, defaults.get(KEY_EXECUTION_DIR)
            )
            if execution_dir_str:
                execution_dir = ensure_path(execution_dir_str)
        if execution_dir:
            self.default_execution_dir = ensure_path(
                execution_dir, is_file=False, create=True
            )
        else:
            self.default_execution_dir = None

        if not proxies and KEY_PROXIES in defaults:
            proxies = env_map(defaults[KEY_PROXIES])
        self.proxies = proxies or {}

        if not http_headers and KEY_HTTP_HEADERS in defaults:
            http_headers = defaults[KEY_HTTP_HEADERS]
            for d in http_headers:
                if "pattern" in d:
                    d["pattern"] = re.compile(d.pop("pattern"))
        self.default_http_headers = http_headers or []

        self.show_progress = show_progress
        if self.show_progress is None:
            self.show_progress = defaults.get(KEY_SHOW_PROGRESS)

        self.executor_defaults = executor_defaults or {}
        if KEY_EXECUTORS in defaults:
            for name, d in defaults[KEY_EXECUTORS].items():
                if name not in self.executor_defaults:
                    self.executor_defaults[name] = d
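    # A minimal sketch of the JSON config file this constructor parses. The
    # top-level keys correspond to the KEY_* constants above; the specific
    # values shown are illustrative assumptions, not defaults:
    #
    #     {
    #         "cache_dir": "~/.pytest_wdl_cache",
    #         "execution_dir": "/tmp/pytest_wdl_executions",
    #         "proxies": {"http": {"env": "HTTP_PROXY"}},
    #         "http_headers": [
    #             {"pattern": "https://example.com/.*",
    #              "name": "X-Api-Token", "env": "EXAMPLE_TOKEN"}
    #         ],
    #         "show_progress": true,
    #         "executors": {"cromwell": {"java_args": "-Xmx4G"}}
    #     }
    #
    # Note that 'pattern' values loaded from the config file are compiled to
    # regular expressions by __init__.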
    def get_executor_defaults(self, executor_name: str) -> dict:
        """
        Get default configuration values for the given executor.

        Args:
            executor_name: The executor name.

        Returns:
            A dict with the executor configuration values, if any.
        """
        return self.executor_defaults.get(executor_name, {})
    def cleanup(self) -> None:
        """
        Performs cleanup operations, such as deleting the cache directory if
        `self.remove_cache_dir` is True.
        """
        if self.remove_cache_dir:
            shutil.rmtree(self.cache_dir)
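# Usage sketch (illustrative, not part of this module): constructor arguments
# take precedence over values loaded from the config file.
#
#     config = UserConfiguration(
#         config_file=Path.home() / ".pytest_wdl_config.json",  # hypothetical path
#         show_progress=False,
#     )
#     cromwell_defaults = config.get_executor_defaults("cromwell")
#     ...
#     config.cleanup()  # removes the cache dir only if it was a temp dir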
class Localizer(metaclass=ABCMeta):  # pragma: no-cover
    """
    Abstract base of classes that implement file localization.
    """
    @abstractmethod
    def localize(self, destination: Path) -> None:
        """
        Localize a resource to `destination`.

        Args:
            destination: Path to file where the non-local resource is to be
                localized.
        """
        pass
class UrlLocalizer(Localizer):
    """
    Localizes a file specified by a URL.
    """
    def __init__(
        self,
        url: str,
        user_config: UserConfiguration,
        http_headers: Optional[dict] = None
    ):
        self.url = url
        self.user_config = user_config
        self._http_headers = http_headers
    def localize(self, destination: Path):
        try:
            download_file(
                self.url,
                destination,
                http_headers=self.http_headers,
                proxies=self.user_config.proxies,
                show_progress=self.user_config.show_progress
            )
        except Exception as err:
            raise RuntimeError(f"Error localizing url {self.url}") from err
    @property
    def http_headers(self) -> dict:
        http_headers = {}

        if self._http_headers:
            http_headers.update(env_map(self._http_headers))

        if self.user_config.default_http_headers:
            for value_dict in self.user_config.default_http_headers:
                name = value_dict["name"]
                pattern = value_dict.get("pattern")
                if name not in http_headers and (
                    pattern is None or pattern.match(self.url)
                ):
                    value = resolve_value_descriptor(value_dict)
                    if value:
                        http_headers[name] = value

        return http_headers

    @property
    def proxies(self) -> dict:
        return self.user_config.proxies
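# Sketch of how `http_headers` merges per-file and default headers (the URL,
# header name, and env var below are illustrative assumptions): a default
# header applies only when its compiled 'pattern' matches the URL and the same
# header was not already set on the localizer itself. Note that headers passed
# directly to the UserConfiguration constructor are not compiled automatically
# (only config-file headers are), so a pattern must be pre-compiled here:
#
#     config = UserConfiguration(http_headers=[{
#         "pattern": re.compile(r"https://example\.com/.*"),
#         "name": "Authorization",
#         "env": "EXAMPLE_TOKEN",
#     }])
#     loc = UrlLocalizer("https://example.com/data/reads.bam", config)
#     loc.http_headers  # -> {"Authorization": <value of $EXAMPLE_TOKEN>} if set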
class StringLocalizer(Localizer):
    """
    Localizes a string by writing it to a file.
    """
    def __init__(self, contents: str):
        self.contents = contents
    def localize(self, destination: Path):
        LOG.debug(f"Persisting {destination} from contents")
        with open(destination, "wt") as out:
            out.write(self.contents)
class LinkLocalizer(Localizer):
    """
    Localizes a file to another destination using a symlink.
    """
    def __init__(self, source: Path):
        self.source = source
    def localize(self, destination: Path):
        destination.symlink_to(self.source)
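# Sketch of the three concrete localizers (all paths and the URL are
# illustrative assumptions):
#
#     StringLocalizer("chr1\t100\t200\n").localize(Path("/tmp/regions.bed"))
#     LinkLocalizer(Path("/data/ref.fa")).localize(Path("/tmp/ref.fa"))  # symlink
#     UrlLocalizer("https://example.com/ref.fa", config).localize(Path("/tmp/ref.fa"))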
class DataFile:
    """
    A data file, which may be local, remote, or represented as a string.

    Args:
        local_path: Path where the data file should exist after being localized.
        localizer: Localizer object, for persisting the file on the local disk.
        allowed_diff_lines: Number of lines by which the file is allowed to
            differ from another and still be considered equal.
    """
    def __init__(
        self,
        local_path: Path,
        localizer: Optional[Localizer] = None,
        allowed_diff_lines: Optional[int] = 0
    ):
        if localizer is None and not local_path.exists():
            raise ValueError(
                f"Local path {local_path} does not exist and 'localizer' is None"
            )
        self.local_path = local_path
        self.localizer = localizer
        self.allowed_diff_lines = allowed_diff_lines or 0

    @property
    def path(self) -> Path:
        if not self.local_path.exists():
            self.localizer.localize(self.local_path)
        return self.local_path

    def __str__(self) -> str:
        return str(self.local_path)
    def assert_contents_equal(self, other: Union[str, Path, "DataFile"]) -> None:
        """
        Assert that the contents of two files are equal. If
        `allowed_diff_lines == 0`, the files are compared using MD5 hashes;
        otherwise their contents are compared using the Linux `diff` command.

        Args:
            other: A `DataFile` or string file path.

        Raises:
            AssertionError: if the files are different.
        """
        allowed_diff_lines = self.allowed_diff_lines

        if isinstance(other, Path):
            other_path = other
        elif isinstance(other, str):
            other_path = Path(other)
        else:
            other_path = other.path
            allowed_diff_lines = max(allowed_diff_lines, other.allowed_diff_lines)

        self._assert_contents_equal(self.path, other_path, allowed_diff_lines)
    @classmethod
    def _assert_contents_equal(
        cls, file1: Path, file2: Path, allowed_diff_lines: int
    ) -> None:
        if allowed_diff_lines:
            cls._diff_contents(file1, file2, allowed_diff_lines)
        else:
            cls._compare_hashes(file1, file2)

    @classmethod
    def _diff_contents(cls, file1: Path, file2: Path, allowed_diff_lines: int) -> None:
        if file1.suffix == ".gz":
            with tempdir() as temp:
                temp_file1 = temp / "file1"
                temp_file2 = temp / "file2"
                delegator.run(f"gunzip -c {file1} > {temp_file1}", block=True)
                delegator.run(f"gunzip -c {file2} > {temp_file2}", block=True)
                diff_lines = cls._diff(temp_file1, temp_file2)
        else:
            diff_lines = cls._diff(file1, file2)

        if diff_lines > allowed_diff_lines:
            raise AssertionError(
                f"{diff_lines} lines (which is > {allowed_diff_lines} allowed) are "
                f"different between files {file1}, {file2}"
            )

    @classmethod
    def _diff(cls, file1: Path, file2: Path) -> int:
        cmd = f"diff -y --suppress-common-lines {file1} {file2} | grep '^' | wc -l"
        return int(delegator.run(cmd, block=True).out)

    @classmethod
    def _compare_hashes(cls, file1: Path, file2: Path) -> None:
        with open(file1, "rb") as inp1:
            file1_md5 = hashlib.md5(inp1.read()).hexdigest()
        with open(file2, "rb") as inp2:
            file2_md5 = hashlib.md5(inp2.read()).hexdigest()
        if file1_md5 != file2_md5:
            raise AssertionError(
                f"MD5 hashes differ between expected identical files "
                f"{file1}, {file2}"
            )
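# Comparison sketch (file names are illustrative): with allowed_diff_lines == 0
# the files must have identical MD5 hashes; otherwise `diff` counts differing
# lines, gunzip-ing .gz inputs first. When comparing two DataFiles, the larger
# of the two allowed_diff_lines values is used.
#
#     expected = DataFile(Path("expected.vcf"), allowed_diff_lines=2)
#     actual = DataFile(Path("actual.vcf"))
#     expected.assert_contents_equal(actual)  # passes if <= 2 lines differ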
DATA_TYPES = plugin_factory_map(DataFile, "pytest_wdl.data_types")
"""Data type plugin modules from the discovered entry points."""
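# Sketch of how a third-party package could register a custom data type under
# the "pytest_wdl.data_types" entry point group (the package, module, and class
# names are hypothetical):
#
#     # setup.py
#     setup(
#         ...,
#         entry_points={
#             "pytest_wdl.data_types": ["vcf = my_pkg.vcf:VcfDataFile"]
#         },
#     )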
class DataDirs:
    """
    Provides data files from a test data directory structure as defined by the
    datadir and datadir-ng plugins. Paths are resolved lazily upon first
    request.
    """
    def __init__(
        self,
        basedir: Path,
        module,  # TODO: no Module type in typelib yet
        function: Callable,
        cls: Optional[Type] = None
    ):
        module_path = module.__name__.split(".")
        if len(module_path) > 1:
            for mod in reversed(module_path[:-1]):
                if basedir.name == mod:
                    basedir = basedir.parent
                else:
                    raise RuntimeError(
                        f"Module path {module_path} does not match basedir {basedir}"
                    )
        self.basedir = basedir
        self.module = os.path.join(*module_path)
        self.function = function.__name__
        self.cls = cls.__name__ if cls else None
        self._paths = None

    @property
    def paths(self) -> List[Path]:
        if self._paths is None:
            def add_datadir_paths(root: Path):
                testdir = root / self.module
                if testdir.exists():
                    if self.cls is not None:
                        clsdir = testdir / self.cls
                        if clsdir.exists():
                            fndir = clsdir / self.function
                            if fndir.exists():
                                self._paths.append(fndir)
                            self._paths.append(clsdir)
                    else:
                        fndir = testdir / self.function
                        if fndir.exists():
                            self._paths.append(fndir)
                    self._paths.append(testdir)

            self._paths = []
            add_datadir_paths(self.basedir)
            data_root = self.basedir / "data"
            if data_root.exists():
                add_datadir_paths(data_root)
                self._paths.append(data_root)

        return self._paths
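# Path resolution sketch: for a test function `test_foo` in class `TestFoo` in
# module `tests/test_mod.py`, `paths` searches the following directories in
# order, keeping only those that exist (layout is illustrative):
#
#     tests/test_mod/TestFoo/test_foo/
#     tests/test_mod/TestFoo/
#     tests/test_mod/
#     tests/data/test_mod/TestFoo/test_foo/
#     tests/data/test_mod/TestFoo/
#     tests/data/test_mod/
#     tests/data/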
class DataResolver:
    """
    Resolves data files that may need to be localized.
    """
    def __init__(self, data_descriptors: dict, user_config: UserConfiguration):
        self.data_descriptors = data_descriptors
        self.user_config = user_config
    def resolve(
        self, name: str, datadirs: Optional[DataDirs] = None
    ) -> DataFile:
        if name not in self.data_descriptors:
            raise ValueError(f"Unrecognized name {name}")

        value = self.data_descriptors[name]
        if isinstance(value, dict):
            return self.create_data_file(datadirs=datadirs, **cast(dict, value))
        else:
            return value
    def create_data_file(
        self,
        type: Optional[str] = "default",
        name: Optional[str] = None,
        path: Optional[str] = None,
        url: Optional[str] = None,
        contents: Optional[str] = None,
        env: Optional[str] = None,
        datadirs: Optional[DataDirs] = None,
        http_headers: Optional[dict] = None,
        **kwargs
    ) -> DataFile:
        data_file_class = DATA_TYPES.get(type, DataFile)
        local_path = None
        localizer = None

        if path:
            local_path = ensure_path(path, [self.user_config.cache_dir])

        if local_path and local_path.exists():
            pass
        elif env and env in os.environ:
            env_path = ensure_path(os.environ[env], exists=True)
            if not local_path:
                local_path = env_path
            else:
                localizer = LinkLocalizer(env_path)
        elif url:
            localizer = UrlLocalizer(url, self.user_config, http_headers)
            if not local_path:
                if name:
                    local_path = ensure_path(self.user_config.cache_dir / name)
                else:
                    filename = url.rsplit("/", 1)[1]
                    local_path = ensure_path(self.user_config.cache_dir / filename)
        elif contents:
            localizer = StringLocalizer(contents)
            if not local_path:
                if name:
                    local_path = ensure_path(self.user_config.cache_dir / name)
                else:
                    local_path = ensure_path(
                        tempfile.mktemp(dir=self.user_config.cache_dir)
                    )
        elif name and datadirs:
            for dd in datadirs.paths:
                dd_path = dd / name
                if dd_path.exists():
                    break
            else:
                raise FileNotFoundError(
                    f"File {name} not found in any of the following datadirs: "
                    f"{datadirs.paths}"
                )
            if not local_path:
                local_path = dd_path
            else:
                localizer = LinkLocalizer(dd_path)
        else:
            raise FileNotFoundError(
                f"File {path or name} does not exist. Either a url, file contents, "
                f"or a local file must be provided."
            )

        return data_file_class(local_path, localizer, **kwargs)
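# Sketch of test_data.json entries that `create_data_file` can resolve; keys
# mirror its parameters, and sources are tried in order: an existing local
# path, an environment variable, a url, literal contents, then the datadirs
# (URLs, names, and the "bam" type are illustrative assumptions):
#
#     {
#         "reads": {"url": "https://example.com/reads.bam", "type": "bam"},
#         "regions": {"contents": "chr1\t100\t200\n", "name": "regions.bed"},
#         "expected_vcf": {"name": "expected.vcf", "allowed_diff_lines": 2}
#     }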
class DataManager:
    """
    Manages test data, which is defined in a test_data.json file.

    Args:
        data_resolver: Module-level config.
        datadirs: Data directories to search for the data file.
    """
    def __init__(self, data_resolver: DataResolver, datadirs: DataDirs):
        self.data_resolver = data_resolver
        self.datadirs = datadirs

    def __getitem__(self, name: str):
        return self.data_resolver.resolve(name, self.datadirs)
    def get_dict(self, *names: str, **params) -> dict:
        """
        Creates a dict with one or more entries from this DataManager.

        Args:
            *names: Names of test data entries to add to the dict.
            **params: Mapping of workflow parameter names to test data entry
                names.

        Returns:
            Dict mapping parameter names to test data entries for all specified
            names.
        """
        d = {}
        for name in names:
            d[name] = self[name]
        for param, name in params.items():
            d[param] = self[name]
        return d
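# Usage sketch inside a test (the fixture and entry names are illustrative):
# positional names keep their own keys, while keyword arguments rename entries
# to workflow parameter names.
#
#     def test_variant_caller(workflow_data):
#         inputs = workflow_data.get_dict("reads", expected_vcf="truth")
#         # -> {"reads": <DataFile>, "expected_vcf": <DataFile for "truth">}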
class Executor(metaclass=ABCMeta):
    """
    Base class for WDL workflow executors.
    """
    @abstractmethod
    def run_workflow(
        self,
        wdl_script: Union[str, Path],
        workflow_name: Optional[str] = None,
        inputs: Optional[dict] = None,
        expected: Optional[dict] = None,
        **kwargs
    ) -> dict:
        """
        Run a WDL workflow on the given inputs, and check that the outputs
        match the given expected values.

        Args:
            wdl_script: The WDL script to execute.
            workflow_name: The name of the workflow in the WDL script. If None,
                the name of the WDL script is used (without the .wdl extension).
            inputs: Object that will be serialized to JSON and provided to
                Cromwell as the workflow inputs.
            expected: Dict mapping output parameter names to expected values.
            kwargs: Additional keyword arguments, mostly for debugging:

                * execution_dir: DEPRECATED
                * inputs_file: Path to the Cromwell inputs file to use. Inputs
                  are written to this file only if it doesn't exist.
                * imports_file: Path to the WDL imports file to use. Imports
                  are written to this file only if it doesn't exist.
                * java_args: Additional arguments to pass to the Java runtime.
                * cromwell_args: Additional arguments to pass to `cromwell run`.

        Returns:
            Dict of outputs.

        Raises:
            Exception: if there was an error executing the workflow.
            AssertionError: if the actual outputs don't match the expected
                outputs.
        """
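# Usage sketch (constructor arguments and values are illustrative): concrete
# executors such as the Cromwell executor implement `run_workflow` and are
# discovered via the EXECUTORS plugin map defined below.
#
#     executor = EXECUTORS["cromwell"](...)  # hypothetical construction
#     outputs = executor.run_workflow(
#         "variant_caller.wdl",
#         inputs={"reads": "/path/to/reads.bam"},
#         expected={"vcf": "/path/to/expected.vcf"},
#     )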
EXECUTORS = plugin_factory_map(Executor, "pytest_wdl.executors")
"""Executor plugin modules from the discovered entry points."""