formatting.py 7.88 KB
Newer Older
1
2
3
4
5
6
7
8
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

"""
This script provides a more program-friendly representation of the HPO search space.
The format is an internal helper and is not visible to end users.

You will find this useful when you want to support nested search spaces.

The random tuner is an intuitive example of this utility.
You should check its code before reading the docstrings in this file.
"""

# Names exported by `from <this module> import *`; everything else is private.
__all__ = [
    'ParameterSpec',
    'deformat_parameters',
    'format_parameters',
    'format_search_space',
]

import math
22
from types import SimpleNamespace
23
24
from typing import Any, List, NamedTuple, Optional, Tuple

25
26
import numpy as np

27
28
class ParameterSpec(NamedTuple):
    """
    Specification (aka space / range / domain) of one single parameter.

    NOTE: For `loguniform` (and `qloguniform`), the fields `low` and `high` are logarithm of original values.
    """

    name: str                       # The object key in JSON
    type: str                       # "_type" in JSON
    values: List[Any]               # "_value" in JSON

    key: Tuple[Any, ...]            # The "path" of this parameter: names (str) interleaved with
                                    # the indices (int) of the nested choices it lives under

    categorical: bool               # Whether this parameter is categorical (unordered) or numerical (ordered)
    size: Optional[int] = None      # If it's categorical, how many candidates it has

    # uniform distributed
    low: Optional[float] = None     # Lower bound of uniform parameter
    high: Optional[float] = None    # Upper bound of uniform parameter

    normal_distributed: Optional[bool] = None
                                    # Whether this parameter is uniform or normal distributed
    mu: Optional[float] = None      # µ of normal parameter
    sigma: Optional[float] = None   # σ of normal parameter

    q: Optional[float] = None       # If not `None`, the parameter value should be an integer multiple of this
    clip: Optional[Tuple[float, float]] = None
                                    # For q(log)uniform, this equals to "values[:2]"; for others this is None

    log_distributed: Optional[bool] = None
                                    # Whether this parameter is log distributed
                                    # When true, low/high/mu/sigma describes log of parameter value (like np.lognormal)

    def is_activated_in(self, partial_parameters):
        """
        For nested search space, check whether this parameter should be skipped for current set of parameters.
        This function must be used in a pattern similar to random tuner. Otherwise it will misbehave.

        `partial_parameters` maps the keys of already-decided parameters to their internal values
        (for a choice, the index of the selected candidate).

        Returns a truthy value when this parameter is top-level or when its enclosing choice
        selected the branch this parameter belongs to; a falsy value otherwise.
        """
        # Top-level parameters (or keys without a branch index) are always active.
        if len(self.key) < 2 or isinstance(self.key[-2], str):
            return True
        try:
            selected = partial_parameters[self.key[:-2]]
        except KeyError:
            # The enclosing choice itself was skipped (its own branch was not selected),
            # so everything nested below it is inactive as well.
            return False
        return selected == self.key[-2]
66

67
68
69
70
71
72
73
74
75
76
def format_search_space(search_space):
    """
    Flatten a user provided search space into a dict of `ParameterSpec`.

    Each entry is stored under its own `ParameterSpec.key`, in the order the
    specs are produced by `_format_search_space`.
    """
    # A plain dict is used on purpose: insertion order is preserved by the
    # implementation in CPython 3.6 and by the language spec from 3.7 on, so no
    # extra work is done for 3.6. Drop this note once 3.6 support is removed.
    specs_by_key = {}
    for spec in _format_search_space((), search_space):
        specs_by_key[spec.key] = spec
    return specs_by_key

79
def deformat_parameters(formatted_parameters, formatted_search_space):
    """
    Translate internal-format parameters into the structure end users expect.

    See "test/ut/sdk/test_hpo_formatting.py" for worked examples.

    What happens to each parameter:
     1. "choice" and "randint": the stored index (integer) becomes the corresponding value.
     2. "*log*": x becomes `exp(x)`.
     3. "q*": x becomes `round(x / q) * q`, and is then clipped into range.
     4. Nested choices: flat key-value pairs are folded back into a nested structure.
    """
    result = {}
    for key, raw in formatted_parameters.items():
        spec = formatted_search_space[key]

        if not spec.categorical:
            number = math.exp(raw) if spec.log_distributed else raw
            if spec.q is not None:
                number = round(number / spec.q) * spec.q
            if spec.clip:
                number = min(max(number, spec.clip[0]), spec.clip[1])
            if isinstance(number, np.number):
                # hand plain Python scalars to the user instead of numpy ones
                number = number.item()
            _assign(result, key, number)
            continue

        if spec.type == 'randint':
            base = min(math.ceil(float(bound)) for bound in spec.values)
            _assign(result, key, int(base + raw))
        elif _is_nested_choices(spec.values):
            # a nested choice is reported through the "_name" of the selected sub-space
            _assign(result, (*key, '_name'), spec.values[raw]['_name'])
        else:
            _assign(result, key, spec.values[raw])
    return result

115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
def format_parameters(parameters, formatted_search_space):
    """
    Map parameters in end users' format back to the internal format, mainly so experiments can be resumed.

    The conversion is lossy for "q*" types and for "choice" with duplicate candidates.
    """
    # Resuming from a checkpoint would be preferable to relying on this conversion.
    internal = {}
    for key, spec in formatted_search_space.items():
        if not spec.is_activated_in(internal):
            continue

        # Follow the string parts of the key down the user's nested dict;
        # integer parts are branch indices and have no counterpart there.
        node = parameters
        for part in key:
            if isinstance(part, str):
                node = node[part]

        if not spec.categorical:
            internal[key] = math.log(node) if spec.log_distributed else node
        elif spec.type == 'randint':
            base = min(math.ceil(float(bound)) for bound in spec.values)
            internal[key] = node - base
        elif _is_nested_choices(spec.values):
            candidate_names = [candidate['_name'] for candidate in spec.values]
            internal[key] = candidate_names.index(node['_name'])
        else:
            internal[key] = spec.values.index(node)
    return internal

145
def _format_search_space(parent_key, space):
    """
    Recursively collect the `ParameterSpec` of every parameter in `space`.

    `parent_key` is the path leading to `space`. The returned list holds each
    parameter's spec immediately followed by the specs of its nested sub-spaces
    (when the parameter is a nested choice).
    """
    specs = []
    for name, entry in space.items():
        if name == '_name':
            # "_name" labels the sub-space itself; it is not a parameter
            continue
        type_, candidates = entry['_type'], entry['_value']
        own_key = (*parent_key, name)
        specs.append(_format_parameter(own_key, type_, candidates))
        if type_ == 'choice' and _is_nested_choices(candidates):
            for index, sub_space in enumerate(candidates):
                # children are keyed by the parent's path plus the branch index
                specs.extend(_format_search_space((*own_key, index), sub_space))
    return specs

158
159
160
161
162
163
164
165
def _format_parameter(key, type_, values):
    """
    Build the `ParameterSpec` of a single search space entry.

    `key` is the parameter's path (its last element becomes `name`);
    `type_` and `values` are the entry's "_type" and "_value".
    """
    is_categorical = type_ in ('choice', 'randint')
    draft = SimpleNamespace(name=key[-1], type=type_, values=values, key=key, categorical=is_categorical)

    if is_categorical:
        if type_ == 'choice':
            draft.size = len(values)
        else:
            # randint: distance between the two bounds after rounding each one up
            first = math.ceil(float(values[0]))
            last = math.ceil(float(values[1]))
            draft.size = last - first
        return ParameterSpec(**vars(draft))

    draft.q = float(values[2]) if type_.startswith('q') else None
    draft.log_distributed = 'log' in type_
    draft.normal_distributed = 'normal' in type_

    if draft.normal_distributed:
        draft.mu = float(values[0])
        draft.sigma = float(values[1])
        return ParameterSpec(**vars(draft))

    bounds = (float(values[0]), float(values[1]))
    if draft.q is not None:
        # quantized values are clipped against the original (non-log) bounds
        draft.clip = bounds
    if draft.log_distributed:
        # store log-space bounds so they line up with how mu is expressed
        draft.low, draft.high = math.log(bounds[0]), math.log(bounds[1])
    else:
        draft.low, draft.high = bounds
    return ParameterSpec(**vars(draft))
199
200

def _is_nested_choices(values):
201
    assert values  # choices should not be empty
202
203
204
205
206
207
208
209
210
211
    for value in values:
        if not isinstance(value, dict):
            return False
        if '_name' not in value:
            return False
    return True

def _assign(params, key, x):
    if len(key) == 1:
        params[key[0]] = x
212
213
    elif isinstance(key[0], int):
        _assign(params, key[1:], x)
214
215
216
217
    else:
        if key[0] not in params:
            params[key[0]] = {}
        _assign(params[key[0]], key[1:], x)