formatting.py

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

"""
This script provides a more program-friendly representation of HPO search space.
The format is considered internal helper and is not visible to end users.

You will find this useful when you want to support nested search space.

The random tuner is an intuitive example for this utility.
You should check its code before reading docstrings in this file.
"""

__all__ = [
    'ParameterSpec',
    'deformat_parameters',
    'format_parameters',
    'format_search_space',
]

import math
from types import SimpleNamespace
from typing import Any, List, NamedTuple, Optional, Tuple

import numpy as np

class ParameterSpec(NamedTuple):
    """
    Specification (aka space / range / domain) of one single parameter.

    NOTE: For `loguniform` (and `qloguniform`), the fields `low` and `high` are logarithm of original values.
    """

    name: str                       # The object key in JSON
    type: str                       # "_type" in JSON
    values: List[Any]               # "_value" in JSON

    key: Tuple[str]                 # The "path" of this parameter

    categorical: bool               # Whether this paramter is categorical (unordered) or numerical (ordered)
    size: int = None                # If it's categorical, how many candidates it has

    # uniform distributed
    low: float = None               # Lower bound of uniform parameter
    high: float = None              # Upper bound of uniform parameter

    normal_distributed: bool = None # Whether this parameter is uniform or normal distrubuted
    mu: float = None                # µ of normal parameter
    sigma: float = None             # σ of normal parameter

    q: Optional[float] = None       # If not `None`, the parameter value should be an integer multiple of this
    clip: Optional[Tuple[float, float]] = None
                                    # For q(log)uniform, this equals to "values[:2]"; for others this is None

    log_distributed: bool = None    # Whether this parameter is log distributed
                                    # When true, low/high/mu/sigma describes log of parameter value (like np.lognormal)

    def is_activated_in(self, partial_parameters):
        """
        For nested search space, check whether this parameter should be skipped for current set of paremters.
        This function must be used in a pattern similar to random tuner. Otherwise it will misbehave.
        """
        if len(self.key) < 2 or isinstance(self.key[-2], str):
            return True
        return partial_parameters[self.key[:-2]] == self.key[-2]

def format_search_space(search_space):
    """
    Convert user provided search space into a dict of ParameterSpec.
    The dict key is dict value's `ParameterSpec.key`.
    """
    formatted = _format_search_space(tuple(), search_space)
    # In CPython 3.6, dicts preserve order by internal implementation.
    # In Python 3.7+, dicts preserve order by language spec.
    # Python 3.6 is crappy enough. Don't bother to do extra work for it.
    # Remove these comments when we drop 3.6 support.
    return {spec.key: spec for spec in formatted}

def deformat_parameters(formatted_parameters, formatted_search_space):
    """
    Convert internal format parameters to users' expected format.

    "test/ut/sdk/test_hpo_formatting.py" provides examples of how this works.

    The function do following jobs:
     1. For "choice" and "randint", convert index (integer) to corresponding value.
     2. For "*log*", convert x to `exp(x)`.
     3. For "q*", convert x to `round(x / q) * q`, then clip into range.
     4. For nested choices, convert flatten key-value pairs into nested structure.
    """
    ret = {}
    for key, x in formatted_parameters.items():
        spec = formatted_search_space[key]
        if spec.categorical:
            if spec.type == 'randint':
                lower = min(math.ceil(float(x)) for x in spec.values)
                _assign(ret, key, int(lower + x))
            elif _is_nested_choices(spec.values):
                _assign(ret, tuple([*key, '_name']), spec.values[x]['_name'])
            else:
                _assign(ret, key, spec.values[x])
        else:
            if spec.log_distributed:
                x = math.exp(x)
            if spec.q is not None:
                x = round(x / spec.q) * spec.q
            if spec.clip:
                x = max(x, spec.clip[0])
                x = min(x, spec.clip[1])
            if isinstance(x, np.number):
                x = x.item()
            _assign(ret, key, x)
    return ret

def format_parameters(parameters, formatted_search_space):
    """
    Convert end users' parameter format back to internal format, mainly for resuming experiments.

    The result is not accurate for "q*" and for "choice" that have duplicate candidates.
    """
    # I don't like this function. It's better to use checkpoint for resuming.
    ret = {}
    for key, spec in formatted_search_space.items():
        if not spec.is_activated_in(ret):
            continue
        value = parameters
        for name in key:
            if isinstance(name, str):
                value = value[name]
        if spec.categorical:
            if spec.type == 'randint':
                lower = min(math.ceil(float(x)) for x in spec.values)
                ret[key] = value - lower
            elif _is_nested_choices(spec.values):
                names = [nested['_name'] for nested in spec.values]
                ret[key] = names.index(value['_name'])
            else:
                ret[key] = spec.values.index(value)
        else:
            if spec.log_distributed:
                value = math.log(value)
            ret[key] = value
    return ret

def _format_search_space(parent_key, space):
    formatted = []
    for name, spec in space.items():
        if name == '_name':
            continue
        key = tuple([*parent_key, name])
        formatted.append(_format_parameter(key, spec['_type'], spec['_value']))
        if spec['_type'] == 'choice' and _is_nested_choices(spec['_value']):
            for index, sub_space in enumerate(spec['_value']):
                key = tuple([*parent_key, name, index])
                formatted += _format_search_space(key, sub_space)
    return formatted

def _format_parameter(key, type_, values):
    spec = SimpleNamespace(
        name = key[-1],
        type = type_,
        values = values,
        key = key,
        categorical = type_ in ['choice', 'randint'],
    )

    if spec.categorical:
        if type_ == 'choice':
            spec.size = len(values)
        else:
            lower = math.ceil(float(values[0]))
            upper = math.ceil(float(values[1]))
            spec.size = upper - lower

    else:
        if type_.startswith('q'):
            spec.q = float(values[2])
        else:
            spec.q = None
        spec.log_distributed = ('log' in type_)

        if 'normal' in type_:
            spec.normal_distributed = True
            spec.mu = float(values[0])
            spec.sigma = float(values[1])

        else:
            spec.normal_distributed = False
            spec.low = float(values[0])
            spec.high = float(values[1])
            if spec.q is not None:
                spec.clip = (spec.low, spec.high)
            if spec.log_distributed:
                # make it align with mu
                spec.low = math.log(spec.low)
                spec.high = math.log(spec.high)

    return ParameterSpec(**spec.__dict__)

def _is_nested_choices(values):
    assert values  # choices should not be empty
    for value in values:
        if not isinstance(value, dict):
            return False
        if '_name' not in value:
            return False
    return True

def _assign(params, key, x):
    if len(key) == 1:
        params[key[0]] = x
    elif isinstance(key[0], int):
        _assign(params, key[1:], x)
    else:
        if key[0] not in params:
            params[key[0]] = {}
        _assign(params[key[0]], key[1:], x)