spec.proto 10.1 KB
Newer Older
Ivan Bogatyy's avatar
Ivan Bogatyy committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
// DRAGNN Configuration proto. See go/dragnn-design for more information.

syntax = "proto2";

package syntaxnet.dragnn;

// Specifies a set of DRAGNN components (transition systems) that are trained
// and evaluated jointly. Each component gets one ComponentSpec.
//
// The order of components is important: for now, a component can only link to
// components that come before it.
// NEXT ID: 6
message MasterSpec {
  // Field number 5 lies below NEXT ID but is not declared by any field in this
  // message. Reserve it so that it can never be handed out again by accident.
  // NOTE(review): presumably 5 belonged to a field that was removed -- confirm.
  reserved 5;

  // Components of the model, listed in the order described above.
  repeated ComponentSpec component = 1;

  // DEPRECATED: Use the "batch_size" param of DragnnTensorFlowTrainer instead.
  optional int32 deprecated_batch_size = 2 [default = 1, deprecated = true];

  // DEPRECATED: Use ComponentSpec.*_beam_size instead.
  optional int32 deprecated_beam_size = 3 [default = 1, deprecated = true];

  // Whether to extract debug traces.
  optional bool debug_tracing = 4 [default = false];
}

// Complete specification for a single task. One ComponentSpec is listed per
// entry of MasterSpec.component.
// NEXT ID: 13
message ComponentSpec {
  // Name for this component: this is used in linked features via the
  // "source_component" field.
  optional string name = 1;

  // TransitionSystem to use.
  optional RegisteredModuleSpec transition_system = 2;

  // Resources that this component depends on. These are copied to TaskInputs
  // when calling SAFT code.
  repeated Resource resource = 3;

  // Feature space configurations: channels that are a fixed function of the
  // input (fixed_feature) and channels that link to component activations
  // (linked_feature).
  repeated FixedFeatureChannel fixed_feature = 4;
  repeated LinkedFeatureChannel linked_feature = 5;

  // Neural Network builder specification.
  optional RegisteredModuleSpec network_unit = 6;

  // The registered C++ implementation of the dragnn::Component class; e.g.
  // "SyntaxNetComponent".
  optional RegisteredModuleSpec backend = 7;

  // Number of possible actions from every state.
  optional int32 num_actions = 8;

  // Specify the name of the lower level component on which it has attention.
  // Defaults to the empty string.
  optional string attention_component = 9 [default = ""];

  // Options for the ComponentBuilder. If this is empty, the regular
  // tf.while_loop based builder is assumed.
  optional RegisteredModuleSpec component_builder = 10;

  // Default max number of active states for beam training.
  optional int32 training_beam_size = 11 [default = 1];

  // Default max number of active states for beam inference.
  optional int32 inference_beam_size = 12 [default = 1];
}

// Generic container for any registered sub-piece of DRAGNN: the name of the
// registered class plus free-form string parameters.
// NEXT ID: 3
message RegisteredModuleSpec {
  // Name of the registered class.
  optional string registered_name = 1;

  // Parameters to set while initializing this system; these are copied to
  // Parameters in a TaskSpec when calling SAFT code, or via kwargs in TF Python
  // code. Both keys and values are strings. As with any protobuf map, the
  // iteration order of the entries is undefined and must not be relied upon.
  map<string, string> parameters = 2;
}

// Fixed resources that will be converted into TaskInputs when calling SAFT
// code.
// NEXT ID: 3
message Resource {
  // Name of this resource.
  optional string name = 1;
  // Parts that make up this resource; see Part for the per-part fields.
  repeated Part part = 2;
}

// One part of a Resource. The Parts here should be more or less compatible
// with TaskInput.
// NEXT ID: 4
message Part {
  // File pattern for this part.
  // NOTE(review): the pattern syntax is not specified in this file -- confirm
  // against TaskInput.
  optional string file_pattern = 1;
  // Format of the file(s), given as a string.
  optional string file_format = 2;
  // Format of the records, given as a string.
  optional string record_format = 3;
}

// ------------------------------------------------------------------------
// Feature specifications.
//
// A *feature channel* is a named collection of feature templates that share an
// embedding matrix. Thus all features in the channel are assumed to use the
// same vocabulary: e.g., words, POS tags, hidden layer activations, etc. These
// are extracted, embedded, and then concatenated together as a group.

// Specification for a feature channel that is a *fixed* function of the input.
// NEXT ID: 10
message FixedFeatureChannel {
  // Interpretable name for this feature channel. NN builders might depend on
  // this to determine how to hook different channels up internally.
  optional string name = 1;

  // String describing the FML for this feature channel.
  optional string fml = 2;

  // Size of parameters for this space:

  // Dimensions of embedding space, or -1 if the feature should not be embedded.
  optional int32 embedding_dim = 3;

  // Number of possible values returned.
  optional int32 vocabulary_size = 4;

  // Number of different feature templates in the channel, i.e. the number of
  // features that will be concatenated but share the embedding for this
  // channel.
  optional int32 size = 5;

  // Whether the embeddings for this channel should be held constant at their
  // pretrained values, instead of being trained.  Pretrained embeddings are
  // required when true.
  // Note: this field carries number 9 although it is declared before fields
  // 6-8; field numbers must not be changed to match declaration order.
  optional bool is_constant = 9;

  // Resources for this space:

  // Predicate map for compacting feature values.
  optional string predicate_map = 6;

  // Pointer to a pretrained embedding matrix for this feature set.
  optional Resource pretrained_embedding_matrix = 7;

  // Vocab file, containing all vocabulary words one per line.
  optional Resource vocab = 8;
}

// Specification for a feature channel that *links* to component
// activations. Note that the "vocabulary" of these features is the activations
// that they are linked to, so it is determined by the other components in the
// spec.
// NEXT ID: 8
message LinkedFeatureChannel {
  // Interpretable name for this feature channel. NN builders might depend on
  // this to determine how to hook different channels up internally.
  optional string name = 1;

  // Feature function specification. Note: these should all be of type
  // LinkedFeatureType.
  optional string fml = 2;

  // Embedding dimension, or -1 if the link should not be embedded.
  optional int32 embedding_dim = 3;

  // Number of different feature templates in the channel, i.e. the number of
  // features that will be concatenated but share the embedding for this
  // channel.
  optional int32 size = 4;

  // Component to use for translation, e.g. "tagger". This refers to a
  // component by its ComponentSpec.name.
  optional string source_component = 5;

  // Translator target, e.g. "token" or "last_action", to translate raw feature
  // values into indices. This must be interpretable by the Component referenced
  // by source_component.
  optional string source_translator = 6;

  // Layer that these features should connect to.
  optional string source_layer = 7;
}

// A vector of hyperparameter configurations to search over, together with the
// training targets to build.
// NEXT ID: 3
message TrainingGridSpec {
  // Grid points to search over. Each GridPoint is one hyperparameter
  // configuration for a training run.
  repeated GridPoint grid_point = 1;

  // Training targets to create in the graph builder stage.
  repeated TrainTarget target = 2;
}

// A hyperparameter configuration for a training run.
// NEXT ID: 22
message GridPoint {
  // Global learning rate initialization point.
  optional double learning_rate = 1 [default = 0.1];

  // Momentum coefficient when using MomentumOptimizer.
  optional double momentum = 2 [default = 0.9];

  // Decay rate and base for global learning rate decay.  The learning rate is
  // reduced by a factor of |decay_base| every |decay_steps|.
  optional double decay_base = 16 [default = 0.96];
  optional int32 decay_steps = 3 [default = 1000];

  // Whether to decay the learning rate in a "staircase" manner.  If true, the
  // rate is adjusted exactly once every |decay_steps|.  Otherwise, the rate is
  // adjusted in smaller increments on every step, such that the overall rate of
  // decay is still |decay_base| every |decay_steps|.
  optional bool decay_staircase = 17 [default = true];

  // Random seed to initialize parameters.
  optional int32 seed = 4 [default = 0];

  // Specify the optimizer used in training, the default is MomentumOptimizer.
  optional string learning_method = 7 [default = "momentum"];

  // Whether or not to use a moving average of the weights in inference time.
  optional bool use_moving_average = 8 [default = false];

  // Rolling average update coefficient.
  optional double average_weight = 9 [default = 0.9999];

  // The dropout *keep* probability rate used in the model. 1.0 = no dropout.
  // Despite the field name, this is the probability of keeping a value, not of
  // dropping it.
  optional double dropout_rate = 10 [default = 1.0];

  // The dropout *keep* probability rate for recurrent connections.  If < 0.0,
  // recurrent connections should use |dropout_rate| instead.  1.0 = no dropout.
  optional double recurrent_dropout_rate = 20 [default = -1.0];

  // Gradient clipping threshold, applied if greater than zero. A value in the
  // range 1-20 seems to work well to prevent large learning rates from causing
  // problems for updates at the start of training.
  optional double gradient_clip_norm = 11 [default = 0.0];

  // DEPRECATED: Use TrainTarget.component_weights instead.
  repeated double component_weights = 5 [deprecated = true];

  // DEPRECATED: Use TrainTarget.unroll_using_oracle instead.
  repeated bool unroll_using_oracle = 6 [deprecated = true];

  // A spec for using multiple optimization methods. Both methods are
  // themselves GridPoints, so this message is recursive.
  message CompositeOptimizerSpec {
    // First optimizer.
    optional GridPoint method1 = 1;

    // Second optimizer.
    optional GridPoint method2 = 2;

    // After this number of steps, switch from first to second.
    optional int32 switch_after_steps = 3;
  }
  optional CompositeOptimizerSpec composite_optimizer_spec = 12;

  // Parameters for Adam training.
  // NOTE(review): adam_beta1 defaults to 0.01, far from the 0.9 commonly used
  // with Adam -- confirm this is intentional before relying on the default.
  optional double adam_beta1 = 13 [default = 0.01];
  optional double adam_beta2 = 14 [default = 0.9999];
  optional double adam_eps = 15 [default = 1e-8];

  // Coefficient for global L2 regularization.
  optional double l2_regularization_coefficient = 18 [default = 1e-4];

  // Coefficient for global self normalization regularization.
  // A value of zero turns it off.
  optional double self_norm_alpha = 19 [default = 0.0];

  // Comma separated list of components to which self_norm_alpha
  // should be restricted. If left empty, no filtering will take
  // place. Typically a single component.
  optional string self_norm_components_filter = 21;
}

// Training target to be built into the graph.
// NEXT ID: 5
message TrainTarget {
  // Name for this target. This should be unique across all targets.
  optional string name = 1;

  // Specify the weights for different components. This should be the same size
  // as the number of components in the spec, or empty (defaults to equal
  // weights). Weights are normalized across the components being trained to sum
  // to one.
  repeated double component_weights = 2;

  // Specify whether to train a component using supervised signal or not. This
  // should be the same size as the number of components in the spec, or empty
  // (defaults to all true).
  repeated bool unroll_using_oracle = 3;

  // Maximum length of the pipeline to train. E.g. if max_index is 1, then only
  // the first component will be trained via this target.
  // NOTE(review): the default of -1 presumably means "no limit" -- confirm
  // against the code that consumes this field.
  optional int32 max_index = 4 [default = -1];
}