run_distillation.py 5.33 KB
Newer Older
Chen Chen's avatar
Chen Chen committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# pylint: disable=line-too-long
"""Creating the task and start trainer."""

import pprint

from absl import app
from absl import flags
from absl import logging
import gin
from official.common import distribute_utils
from official.common import flags as tfm_flags
from official.core import config_definitions as cfg
from official.core import train_utils
from official.modeling import hyperparams
from official.modeling import optimization
from official.modeling import performance
from official.modeling.progressive import train_lib
from official.modeling.progressive import trainer as prog_trainer_lib
from official.nlp.data import pretrain_dataloader
from official.nlp.projects.mobilebert import distillation


FLAGS = flags.FLAGS

optimization_config = optimization.OptimizationConfig(
    optimizer=optimization.OptimizerConfig(
        type='lamb',
        lamb=optimization.LAMBConfig(
            weight_decay_rate=0.01,
            exclude_from_weight_decay=['LayerNorm', 'bias', 'norm'],
            clipnorm=1.0)),
    learning_rate=optimization.LrConfig(
        type='polynomial',
        polynomial=optimization.PolynomialLrConfig(
            initial_learning_rate=1.5e-3,
            decay_steps=10000,
            end_learning_rate=1.5e-3)),
    warmup=optimization.WarmupConfig(
        type='linear',
        linear=optimization.LinearWarmupConfig(warmup_learning_rate=0)))


# copy from progressive/utils.py due to the private visibility issue.
def config_override(params, flags_obj):
  """Override ExperimentConfig according to flags."""
  # Change runtime.tpu to the real tpu.
  params.override({
      'runtime': {
          'tpu': flags_obj.tpu,
      }
  })

  # Get the first level of override from `--config_file`.
  #   `--config_file` is typically used as a template that specifies the common
  #   override for a particular experiment.
  for config_file in flags_obj.config_file or []:
    params = hyperparams.override_params_dict(
        params, config_file, is_strict=True)

  # Get the second level of override from `--params_override`.
  #   `--params_override` is typically used as a further override over the
  #   template. For example, one may define a particular template for training
  #   ResNet50 on ImageNet in a config file and pass it via `--config_file`,
  #   then define different learning rates and pass it via `--params_override`.
  if flags_obj.params_override:
    params = hyperparams.override_params_dict(
        params, flags_obj.params_override, is_strict=True)

  params.validate()
  params.lock()

  pp = pprint.PrettyPrinter()
  logging.info('Final experiment parameters: %s', pp.pformat(params.as_dict()))

  model_dir = flags_obj.model_dir
  if 'train' in flags_obj.mode:
    # Pure eval modes do not output yaml files. Otherwise continuous eval job
    # may race against the train job for writing the same file.
    train_utils.serialize_config(params, model_dir)

  return params


def get_exp_config():
  """Get ExperimentConfig."""
  params = cfg.ExperimentConfig(
      task=distillation.BertDistillationTaskConfig(
          train_data=pretrain_dataloader.BertPretrainDataConfig(),
          validation_data=pretrain_dataloader.BertPretrainDataConfig(
              is_training=False)),
      trainer=prog_trainer_lib.ProgressiveTrainerConfig(
          progressive=distillation.BertDistillationProgressiveConfig(),
          optimizer_config=optimization_config,
          train_steps=740000,
          checkpoint_interval=20000))

  return config_override(params, FLAGS)


def main(_):
  logging.info('Parsing config files...')
  gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
  params = get_exp_config()

  # Sets mixed_precision policy. Using 'mixed_float16' or 'mixed_bfloat16'
  # can have significant impact on model speeds by utilizing float16 in case of
  # GPUs, and bfloat16 in the case of TPUs. loss_scale takes effect only when
  # dtype is float16
  if params.runtime.mixed_precision_dtype:
124
    performance.set_mixed_precision_policy(params.runtime.mixed_precision_dtype)
Chen Chen's avatar
Chen Chen committed
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
  distribution_strategy = distribute_utils.get_distribution_strategy(
      distribution_strategy=params.runtime.distribution_strategy,
      all_reduce_alg=params.runtime.all_reduce_alg,
      num_gpus=params.runtime.num_gpus,
      tpu_address=params.runtime.tpu)

  with distribution_strategy.scope():
    task = distillation.BertDistillationTask(
        strategy=distribution_strategy,
        progressive=params.trainer.progressive,
        optimizer_config=params.trainer.optimizer_config,
        task_config=params.task)

  train_lib.run_experiment(
      distribution_strategy=distribution_strategy,
      task=task,
      mode=FLAGS.mode,
      params=params,
      model_dir=FLAGS.model_dir)

if __name__ == '__main__':
  tfm_flags.define_flags()
  app.run(main)