# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for official.nlp.tasks.sentence_prediction."""
import functools
import os

from absl.testing import parameterized
import numpy as np
import tensorflow as tf

from official.nlp.configs import bert
from official.nlp.configs import encoders
from official.nlp.data import sentence_prediction_dataloader
from official.nlp.tasks import masked_lm
from official.nlp.tasks import sentence_prediction


def _create_fake_dataset(output_path, seq_length, num_classes, num_examples):
  """Creates a fake dataset."""
  writer = tf.io.TFRecordWriter(output_path)

  def create_int_feature(values):
    return tf.train.Feature(
        int64_list=tf.train.Int64List(value=np.ravel(values)))

  def create_float_feature(values):
    return tf.train.Feature(
        float_list=tf.train.FloatList(value=np.ravel(values)))

  for i in range(num_examples):
    features = {}
    input_ids = np.random.randint(100, size=(seq_length,))
    features["input_ids"] = create_int_feature(input_ids)
    features["input_mask"] = create_int_feature(np.ones_like(input_ids))
    features["segment_ids"] = create_int_feature(np.ones_like(input_ids))
    features["segment_ids"] = create_int_feature(np.ones_like(input_ids))
    features["example_id"] = create_int_feature([i])

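    # num_classes == 1 denotes regression, so the label is a float in [0, 1);
    # otherwise it is a random class id in [0, num_classes).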
    if num_classes == 1:
      features["label_ids"] = create_float_feature([np.random.random()])
    else:
      features["label_ids"] = create_int_feature(
          [np.random.random_integers(0, num_classes - 1, size=())])

    tf_example = tf.train.Example(features=tf.train.Features(feature=features))
    writer.write(tf_example.SerializeToString())
  writer.close()


class SentencePredictionTaskTest(tf.test.TestCase, parameterized.TestCase):

  def setUp(self):
    super(SentencePredictionTaskTest, self).setUp()
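    # Shared config for tests that only need a nominal training input; tests
    # that materialize real TFRecords build their own data config instead.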
    self._train_data_config = (
        sentence_prediction_dataloader.SentencePredictionDataConfig(
            input_path="dummy", seq_length=128, global_batch_size=1))

  def get_model_config(self, num_classes):
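    """Returns a one-layer BERT model config sized for fast tests."""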
    return sentence_prediction.ModelConfig(
        encoder=encoders.EncoderConfig(
            bert=encoders.BertEncoderConfig(vocab_size=30522, num_layers=1)),
        num_classes=num_classes)

  def _run_task(self, config):
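    """Builds the task, runs a train step, saves the model, and validates."""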
    task = sentence_prediction.SentencePredictionTask(config)
    model = task.build_model()
    metrics = task.build_metrics()

    strategy = tf.distribute.get_strategy()
    dataset = strategy.distribute_datasets_from_function(
        functools.partial(task.build_inputs, config.train_data))

    iterator = iter(dataset)
    optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)
    task.train_step(next(iterator), model, optimizer, metrics=metrics)
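    # Saving between the train and validation steps also serves as a smoke
    # test that the task-built model is exportable as a SavedModel.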
    model.save(os.path.join(self.get_temp_dir(), "saved_model"))
    return task.validation_step(next(iterator), model, metrics=metrics)

  @parameterized.named_parameters(
      ("init_cls_pooler", True),
      ("init_encoder", False),
  )
  def test_task(self, init_cls_pooler):
    # Builds a pretraining model and saves a checkpoint to initialize from.
    pretrain_cfg = bert.PretrainerConfig(
        encoder=encoders.EncoderConfig(
            bert=encoders.BertEncoderConfig(vocab_size=30522, num_layers=1)),
        cls_heads=[
            bert.ClsHeadConfig(
                inner_dim=768, num_classes=2, name="next_sentence")
        ])
    pretrain_model = masked_lm.MaskedLMTask(None).build_model(pretrain_cfg)
    # Model variables are only created by the first forward pass.
    _ = pretrain_model(pretrain_model.inputs)
    ckpt = tf.train.Checkpoint(
        model=pretrain_model, **pretrain_model.checkpoint_items)
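    # checkpoint_items exposes named sub-modules (e.g. the encoder) so the
    # fine-tuning task can restore just the pieces it needs.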
    init_path = ckpt.save(self.get_temp_dir())

    # Creates the task.
    config = sentence_prediction.SentencePredictionConfig(
        init_checkpoint=init_path,
        model=self.get_model_config(num_classes=2),
        train_data=self._train_data_config,
        init_cls_pooler=init_cls_pooler)
    task = sentence_prediction.SentencePredictionTask(config)
    model = task.build_model()
    metrics = task.build_metrics()
    dataset = task.build_inputs(config.train_data)

    iterator = iter(dataset)
    optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)
    task.initialize(model)
    task.train_step(next(iterator), model, optimizer, metrics=metrics)
    task.validation_step(next(iterator), model, metrics=metrics)

  @parameterized.named_parameters(
      {
          "testcase_name": "regression",
          "num_classes": 1,
      },
      {
          "testcase_name": "classification",
          "num_classes": 2,
      },
  )
  def test_metrics_and_losses(self, num_classes):
    config = sentence_prediction.SentencePredictionConfig(
        init_checkpoint=self.get_temp_dir(),
        model=self.get_model_config(num_classes),
        train_data=self._train_data_config)
    task = sentence_prediction.SentencePredictionTask(config)
    model = task.build_model()
    metrics = task.build_metrics()
    if num_classes == 1:
      self.assertIsInstance(metrics[0], tf.keras.metrics.MeanSquaredError)
    else:
      self.assertIsInstance(metrics[0],
                            tf.keras.metrics.SparseCategoricalAccuracy)

    dataset = task.build_inputs(config.train_data)
    iterator = iter(dataset)
    optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)
    task.train_step(next(iterator), model, optimizer, metrics=metrics)

    logs = task.validation_step(next(iterator), model, metrics=metrics)
    loss = logs["loss"].numpy()
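    # Loose sanity bounds rather than exact values: a barely trained two-class
    # model scores a cross-entropy near ln(2) ~= 0.69 (< 1.0), while the
    # regression head's mean squared error is expected to exceed 1.0.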
    if num_classes == 1:
      self.assertGreater(loss, 1.0)
    else:
      self.assertLess(loss, 1.0)

  @parameterized.parameters(("matthews_corrcoef", 2),
                            ("pearson_spearman_corr", 1))
  def test_np_metrics(self, metric_type, num_classes):
    config = sentence_prediction.SentencePredictionConfig(
        metric_type=metric_type,
        init_checkpoint=self.get_temp_dir(),
        model=self.get_model_config(num_classes),
        train_data=self._train_data_config)
    task = sentence_prediction.SentencePredictionTask(config)
    model = task.build_model()
    dataset = task.build_inputs(config.train_data)

    iterator = iter(dataset)
    strategy = tf.distribute.get_strategy()
    distributed_outputs = strategy.run(
        functools.partial(task.validation_step, model=model),
        args=(next(iterator),))
    outputs = tf.nest.map_structure(strategy.experimental_local_results,
                                    distributed_outputs)
    aggregated = task.aggregate_logs(step_outputs=outputs)
    aggregated = task.aggregate_logs(state=aggregated, step_outputs=outputs)
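    # Calling aggregate_logs twice simulates accumulating outputs over multiple
    # evaluation steps; reduce_aggregated_logs then computes the numpy metric.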
    self.assertIn(metric_type, task.reduce_aggregated_logs(aggregated))

  def test_np_metrics_cola_partial_batch(self):
    train_data_path = os.path.join(self.get_temp_dir(), "train.tf_record")
    num_examples = 5
    global_batch_size = 8
    seq_length = 16
    _create_fake_dataset(
        train_data_path,
        seq_length=seq_length,
        num_classes=2,
        num_examples=num_examples)

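    # Only five examples for a global batch size of eight exercises the
    # partial-batch path; example ids let the aggregation identify which rows
    # correspond to real examples.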
    train_data_config = (
        sentence_prediction_dataloader.SentencePredictionDataConfig(
            input_path=train_data_path,
            seq_length=seq_length,
            is_training=True,
            label_type="int",
            global_batch_size=global_batch_size,
            drop_remainder=False,
            include_example_id=True))

    config = sentence_prediction.SentencePredictionConfig(
        metric_type="matthews_corrcoef",
        model=self.get_model_config(2),
        train_data=train_data_config)
    outputs = self._run_task(config)
    self.assertEqual(outputs["sentence_prediction"].shape.as_list(), [8, 1])

  def _export_bert_tfhub(self):
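    """Exports a small BERT encoder SavedModel to stand in for a hub module."""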
    encoder = encoders.build_encoder(
        encoders.EncoderConfig(
            bert=encoders.BertEncoderConfig(vocab_size=30522, num_layers=1)))
    encoder_inputs_dict = {x.name: x for x in encoder.inputs}
    encoder_output_dict = encoder(encoder_inputs_dict)
    core_model = tf.keras.Model(
        inputs=encoder_inputs_dict, outputs=encoder_output_dict)
    hub_destination = os.path.join(self.get_temp_dir(), "hub")
    core_model.save(hub_destination, include_optimizer=False, save_format="tf")
    return hub_destination

  def test_task_with_hub(self):
    hub_module_url = self._export_bert_tfhub()
    config = sentence_prediction.SentencePredictionConfig(
        hub_module_url=hub_module_url,
        model=self.get_model_config(2),
        train_data=self._train_data_config)
    self._run_task(config)

  @parameterized.named_parameters(("classification", 5), ("regression", 1))
  def test_prediction(self, num_classes):
    task_config = sentence_prediction.SentencePredictionConfig(
        model=self.get_model_config(num_classes=num_classes),
        train_data=self._train_data_config)
    task = sentence_prediction.SentencePredictionTask(task_config)
    model = task.build_model()

    test_data_path = os.path.join(self.get_temp_dir(), "test.tf_record")
    seq_length = 16
    num_examples = 100
    _create_fake_dataset(
        test_data_path,
        seq_length=seq_length,
        num_classes=num_classes,
        num_examples=num_examples)

    test_data_config = (
        sentence_prediction_dataloader.SentencePredictionDataConfig(
            input_path=test_data_path,
            seq_length=seq_length,
            is_training=False,
            label_type="int" if num_classes > 1 else "float",
            global_batch_size=16,
            drop_remainder=False,
            include_example_id=True))

    predictions = sentence_prediction.predict(task, test_data_config, model)
    self.assertLen(predictions, num_examples)
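    # Classification predictions are class ids (tf.int64); regression
    # predictions are raw float scores (tf.float32).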
    for prediction in predictions:
      self.assertEqual(prediction.dtype,
                       tf.int64 if num_classes > 1 else tf.float32)


if __name__ == "__main__":
  tf.test.main()