"git@developer.sourcefind.cn:change/sglang.git" did not exist on "0b24af4d7984c165adba4032bcd7fd62630b5c48"
Commit d4f5c193 authored by Chen Chen, committed by A. Unique TensorFlower

Internal change

PiperOrigin-RevId: 317691679
parent 1357ce19
@@ -130,3 +130,22 @@ class QADevDataConfig(cfg.DataConfig):
   is_training: bool = False
   seq_length: int = 384
   drop_remainder: bool = False
+
+
+@dataclasses.dataclass
+class TaggingDataConfig(cfg.DataConfig):
+  """Data config for tagging (tasks/tagging)."""
+  input_path: str = ""
+  global_batch_size: int = 48
+  is_training: bool = True
+  seq_length: int = 384
+
+
+@dataclasses.dataclass
+class TaggingDevDataConfig(cfg.DataConfig):
+  """Dev Data config for tagging (tasks/tagging)."""
+  input_path: str = ""
+  global_batch_size: int = 48
+  is_training: bool = False
+  seq_length: int = 384
+  drop_remainder: bool = False
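For orientation, a minimal sketch of constructing the new config with non-default values; the input path is a hypothetical placeholder, not from this commit:

train_data = TaggingDataConfig(
    input_path='/tmp/tagging_train.tf_record',  # hypothetical path
    seq_length=128,
    global_batch_size=32)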
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Loads dataset for the tagging (e.g., NER/POS) task."""
from typing import Mapping, Optional
import tensorflow as tf
from official.core import input_reader
class TaggingDataLoader:
  """A class to load dataset for tagging (e.g., NER and POS) task."""

  def __init__(self, params):
    self._params = params
    self._seq_length = params.seq_length

  def _decode(self, record: tf.Tensor):
    """Decodes a serialized tf.Example."""
    name_to_features = {
        'input_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
        'input_mask': tf.io.FixedLenFeature([self._seq_length], tf.int64),
        'segment_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
        'label_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
    }
    example = tf.io.parse_single_example(record, name_to_features)

    # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
    # So cast all int64 to int32.
    for name in example:
      t = example[name]
      if t.dtype == tf.int64:
        t = tf.cast(t, tf.int32)
      example[name] = t

    return example

  def _parse(self, record: Mapping[str, tf.Tensor]):
    """Parses raw tensors into a dict of tensors to be consumed by the model."""
    x = {
        'input_word_ids': record['input_ids'],
        'input_mask': record['input_mask'],
        'input_type_ids': record['segment_ids']
    }
    y = record['label_ids']
    return (x, y)

  def load(self, input_context: Optional[tf.distribute.InputContext] = None):
    """Returns a tf.data.Dataset."""
    reader = input_reader.InputReader(
        params=self._params, decoder_fn=self._decode, parser_fn=self._parse)
    return reader.read(input_context)
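Given a TaggingDataConfig like the one sketched earlier, the loader would be driven roughly as follows; a sketch, with the record file and resulting shapes as assumptions:

loader = TaggingDataLoader(train_data)  # train_data: the config sketched above
dataset = loader.load()
features, label_ids = next(iter(dataset))
# features['input_word_ids'], features['input_mask'], features['input_type_ids']:
#   int32 tensors of shape [global_batch_size, seq_length]
# label_ids: int32 tensor of the same shape, one tag id per token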
@@ -55,6 +55,7 @@ class BertTokenClassifier(tf.keras.Model):
                dropout_rate=0.1,
                **kwargs):
     self._self_setattr_tracking = False
+    self._network = network
     self._config = {
         'network': network,
         'num_classes': num_classes,
@@ -84,6 +85,10 @@ class BertTokenClassifier(tf.keras.Model):
     super(BertTokenClassifier, self).__init__(
         inputs=inputs, outputs=predictions, **kwargs)
 
+  @property
+  def checkpoint_items(self):
+    return dict(encoder=self._network)
+
   def get_config(self):
     return self._config
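The new `checkpoint_items` property is what enables encoder-only restoration from a pretraining checkpoint; a minimal sketch of the pattern, mirroring `TaggingTask.initialize` later in this commit (the checkpoint path is a hypothetical placeholder):

model = task.build_model()  # a BertTokenClassifier
ckpt = tf.train.Checkpoint(**model.checkpoint_items)  # i.e. Checkpoint(encoder=...)
status = ckpt.restore('/path/to/pretrained_ckpt')  # hypothetical path
status.expect_partial().assert_existing_objects_matched()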
@@ -24,6 +24,7 @@ from official.modeling.hyperparams import config_definitions as cfg
 from official.nlp.bert import input_pipeline
 from official.nlp.configs import encoders
 from official.nlp.modeling import models
+from official.nlp.tasks import utils
 
 
 @dataclasses.dataclass
@@ -57,19 +58,7 @@ class QuestionAnsweringTask(base_task.Task):
 
   def build_model(self):
     if self._hub_module:
-      # TODO(lehou): maybe add the hub_module building logic to a util function.
-      input_word_ids = tf.keras.layers.Input(
-          shape=(None,), dtype=tf.int32, name='input_word_ids')
-      input_mask = tf.keras.layers.Input(
-          shape=(None,), dtype=tf.int32, name='input_mask')
-      input_type_ids = tf.keras.layers.Input(
-          shape=(None,), dtype=tf.int32, name='input_type_ids')
-      bert_model = hub.KerasLayer(self._hub_module, trainable=True)
-      pooled_output, sequence_output = bert_model(
-          [input_word_ids, input_mask, input_type_ids])
-      encoder_network = tf.keras.Model(
-          inputs=[input_word_ids, input_mask, input_type_ids],
-          outputs=[sequence_output, pooled_output])
+      encoder_network = utils.get_encoder_from_hub(self._hub_module)
     else:
       encoder_network = encoders.instantiate_encoder_from_cfg(
           self.task_config.network)
@@ -27,6 +27,7 @@ from official.modeling.hyperparams import config_definitions as cfg
 from official.nlp.configs import bert
 from official.nlp.data import sentence_prediction_dataloader
 from official.nlp.modeling import losses as loss_lib
+from official.nlp.tasks import utils
 
 
 @dataclasses.dataclass
@@ -67,18 +68,7 @@ class SentencePredictionTask(base_task.Task):
 
   def build_model(self):
     if self._hub_module:
-      input_word_ids = tf.keras.layers.Input(
-          shape=(None,), dtype=tf.int32, name='input_word_ids')
-      input_mask = tf.keras.layers.Input(
-          shape=(None,), dtype=tf.int32, name='input_mask')
-      input_type_ids = tf.keras.layers.Input(
-          shape=(None,), dtype=tf.int32, name='input_type_ids')
-      bert_model = hub.KerasLayer(self._hub_module, trainable=True)
-      pooled_output, sequence_output = bert_model(
-          [input_word_ids, input_mask, input_type_ids])
-      encoder_from_hub = tf.keras.Model(
-          inputs=[input_word_ids, input_mask, input_type_ids],
-          outputs=[sequence_output, pooled_output])
+      encoder_from_hub = utils.get_encoder_from_hub(self._hub_module)
       return bert.instantiate_bertpretrainer_from_cfg(
           self.task_config.network, encoder_network=encoder_from_hub)
     else:
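Both build_model refactors above replace the previously inlined TF-Hub wiring with a call to the shared helper `utils.get_encoder_from_hub`, added in official/nlp/tasks/utils.py at the end of this commit.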
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tagging (e.g., NER/POS) task."""
import logging
import dataclasses
import tensorflow as tf
import tensorflow_hub as hub
from official.core import base_task
from official.modeling.hyperparams import config_definitions as cfg
from official.nlp.configs import encoders
from official.nlp.data import tagging_data_loader
from official.nlp.modeling import models
from official.nlp.tasks import utils
@dataclasses.dataclass
class TaggingConfig(cfg.TaskConfig):
  """The model config."""
  # At most one of `init_checkpoint` and `hub_module_url` can be specified.
  init_checkpoint: str = ''
  hub_module_url: str = ''
  network: encoders.TransformerEncoderConfig = (
      encoders.TransformerEncoderConfig())
  num_classes: int = 0
  # The ignored label id will not contribute to the loss. A word may be
  # tokenized into multiple word-piece tokens; we usually assign the real
  # label id to the first token of the word and `ignore_label_id` to the
  # remaining tokens.
  ignore_label_id: int = 0
  train_data: cfg.DataConfig = cfg.DataConfig()
  validation_data: cfg.DataConfig = cfg.DataConfig()
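To illustrate the labeling convention (the tokenization and tag ids below are invented for the example): if "Washington" is split into the word pieces Wash, ##ing, ##ton, only the first piece carries the real tag id:

tokens:     [CLS]  Wash  ##ing  ##ton  [SEP]
label_ids:    0      5     0      0      0     (with ignore_label_id=0)

Only the position labeled 5 contributes to the loss and metrics defined below.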
@base_task.register_task_cls(TaggingConfig)
class TaggingTask(base_task.Task):
  """Task object for tagging (e.g., NER or POS)."""

  def __init__(self, params=cfg.TaskConfig):
    super(TaggingTask, self).__init__(params)
    if params.hub_module_url and params.init_checkpoint:
      raise ValueError('At most one of `hub_module_url` and '
                       '`init_checkpoint` can be specified.')
    if params.num_classes == 0:
      raise ValueError('TaggingConfig.num_classes cannot be 0.')
    if params.hub_module_url:
      self._hub_module = hub.load(params.hub_module_url)
    else:
      self._hub_module = None

  def build_model(self):
    if self._hub_module:
      encoder_network = utils.get_encoder_from_hub(self._hub_module)
    else:
      encoder_network = encoders.instantiate_encoder_from_cfg(
          self.task_config.network)

    return models.BertTokenClassifier(
        network=encoder_network,
        num_classes=self.task_config.num_classes,
        initializer=tf.keras.initializers.TruncatedNormal(
            stddev=self.task_config.network.initializer_range),
        dropout_rate=self.task_config.network.dropout_rate,
        output='logits')

  def build_losses(self, labels, model_outputs, aux_losses=None) -> tf.Tensor:
    model_outputs = tf.cast(model_outputs, tf.float32)
    loss = tf.keras.losses.sparse_categorical_crossentropy(
        labels, model_outputs, from_logits=True)
    # `ignore_label_id` will not contribute to loss.
    label_weights = tf.cast(
        tf.not_equal(labels, self.task_config.ignore_label_id),
        dtype=tf.float32)
    numerator_loss = tf.reduce_sum(loss * label_weights)
    denominator_loss = tf.reduce_sum(label_weights)
    loss = tf.math.divide_no_nan(numerator_loss, denominator_loss)
    return loss
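A small worked example of the masked loss, with invented numbers: if the per-token cross-entropy values are [0.2, 1.0, 0.6, 0.4] and the labels are [5, 0, 7, 0] with ignore_label_id=0, then label_weights is [1, 0, 1, 0], the numerator is 0.2 + 0.6 = 0.8, the denominator is 2, and the loss is 0.4. divide_no_nan returns 0 when every position is ignored.

# Hypothetical values, for illustration only.
per_token_loss = tf.constant([0.2, 1.0, 0.6, 0.4])
label_weights = tf.constant([1.0, 0.0, 1.0, 0.0])  # labels [5, 0, 7, 0]
masked_loss = tf.math.divide_no_nan(
    tf.reduce_sum(per_token_loss * label_weights),
    tf.reduce_sum(label_weights))  # -> 0.8 / 2 = 0.4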
  def build_inputs(self, params, input_context=None):
    """Returns tf.data.Dataset for the tagging task."""
    if params.input_path == 'dummy':

      def dummy_data(_):
        dummy_ids = tf.zeros((1, params.seq_length), dtype=tf.int32)
        x = dict(
            input_word_ids=dummy_ids,
            input_mask=dummy_ids,
            input_type_ids=dummy_ids)
        y = tf.ones((1, params.seq_length), dtype=tf.int32)
        return (x, y)

      dataset = tf.data.Dataset.range(1)
      dataset = dataset.repeat()
      dataset = dataset.map(
          dummy_data, num_parallel_calls=tf.data.experimental.AUTOTUNE)
      return dataset

    dataset = tagging_data_loader.TaggingDataLoader(params).load(input_context)
    return dataset

  def build_metrics(self, training=None):
    del training
    # TODO(chendouble): evaluate using seqeval's f1/precision/recall.
    return [tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')]

  def process_metrics(self, metrics, labels, model_outputs):
    # `ignore_label_id` will not contribute to metrics.
    sample_weight = tf.cast(
        tf.not_equal(labels, self.task_config.ignore_label_id),
        dtype=tf.float32)
    for metric in metrics:
      metric.update_state(labels, model_outputs, sample_weight)

  def process_compiled_metrics(self, compiled_metrics, labels, model_outputs):
    # `ignore_label_id` will not contribute to metrics.
    sample_weight = tf.cast(
        tf.not_equal(labels, self.task_config.ignore_label_id),
        dtype=tf.float32)
    compiled_metrics.update_state(labels, model_outputs, sample_weight)

  def initialize(self, model):
    """Loads a pretrained checkpoint (if it exists) and then trains from iteration 0."""
    ckpt_dir_or_file = self.task_config.init_checkpoint
    if tf.io.gfile.isdir(ckpt_dir_or_file):
      ckpt_dir_or_file = tf.train.latest_checkpoint(ckpt_dir_or_file)
    if not ckpt_dir_or_file:
      return

    ckpt = tf.train.Checkpoint(**model.checkpoint_items)
    status = ckpt.restore(ckpt_dir_or_file)
    status.expect_partial().assert_existing_objects_matched()
    logging.info('Finished loading pretrained checkpoint from %s',
                 ckpt_dir_or_file)
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for official.nlp.tasks.tagging."""
import functools
import os
import tensorflow as tf
from official.nlp.bert import configs
from official.nlp.bert import export_tfhub
from official.nlp.configs import bert
from official.nlp.configs import encoders
from official.nlp.tasks import tagging
class TaggingTest(tf.test.TestCase):

  def setUp(self):
    super(TaggingTest, self).setUp()
    self._encoder_config = encoders.TransformerEncoderConfig(
        vocab_size=30522, num_layers=1)
    self._train_data_config = bert.TaggingDataConfig(
        input_path="dummy", seq_length=128, global_batch_size=1)

  def _run_task(self, config):
    task = tagging.TaggingTask(config)
    model = task.build_model()
    metrics = task.build_metrics()

    strategy = tf.distribute.get_strategy()
    dataset = strategy.experimental_distribute_datasets_from_function(
        functools.partial(task.build_inputs, config.train_data))

    iterator = iter(dataset)
    optimizer = tf.keras.optimizers.SGD(lr=0.1)
    task.train_step(next(iterator), model, optimizer, metrics=metrics)
    task.validation_step(next(iterator), model, metrics=metrics)

  def test_task(self):
    # Saves a checkpoint.
    encoder = encoders.instantiate_encoder_from_cfg(self._encoder_config)
    ckpt = tf.train.Checkpoint(encoder=encoder)
    saved_path = ckpt.save(self.get_temp_dir())

    config = tagging.TaggingConfig(
        init_checkpoint=saved_path,
        network=self._encoder_config,
        train_data=self._train_data_config,
        num_classes=3)
    task = tagging.TaggingTask(config)
    model = task.build_model()
    metrics = task.build_metrics()
    dataset = task.build_inputs(config.train_data)

    iterator = iter(dataset)
    optimizer = tf.keras.optimizers.SGD(lr=0.1)
    task.train_step(next(iterator), model, optimizer, metrics=metrics)
    task.validation_step(next(iterator), model, metrics=metrics)
    task.initialize(model)

  def test_task_with_fit(self):
    config = tagging.TaggingConfig(
        network=self._encoder_config,
        train_data=self._train_data_config,
        num_classes=3)
    task = tagging.TaggingTask(config)
    model = task.build_model()
    model = task.compile_model(
        model,
        optimizer=tf.keras.optimizers.SGD(lr=0.1),
        train_step=task.train_step,
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy(name="accuracy")])
    dataset = task.build_inputs(config.train_data)
    logs = model.fit(dataset, epochs=1, steps_per_epoch=2)
    self.assertIn("loss", logs.history)
    self.assertIn("accuracy", logs.history)

  def _export_bert_tfhub(self):
    bert_config = configs.BertConfig(
        vocab_size=30522,
        hidden_size=16,
        intermediate_size=32,
        max_position_embeddings=128,
        num_attention_heads=2,
        num_hidden_layers=1)
    _, encoder = export_tfhub.create_bert_model(bert_config)
    model_checkpoint_dir = os.path.join(self.get_temp_dir(), "checkpoint")
    checkpoint = tf.train.Checkpoint(model=encoder)
    checkpoint.save(os.path.join(model_checkpoint_dir, "test"))
    model_checkpoint_path = tf.train.latest_checkpoint(model_checkpoint_dir)

    vocab_file = os.path.join(self.get_temp_dir(), "uncased_vocab.txt")
    with tf.io.gfile.GFile(vocab_file, "w") as f:
      f.write("dummy content")

    hub_destination = os.path.join(self.get_temp_dir(), "hub")
    export_tfhub.export_bert_tfhub(bert_config, model_checkpoint_path,
                                   hub_destination, vocab_file)
    return hub_destination

  def test_task_with_hub(self):
    hub_module_url = self._export_bert_tfhub()
    config = tagging.TaggingConfig(
        hub_module_url=hub_module_url,
        network=self._encoder_config,
        num_classes=4,
        train_data=self._train_data_config)
    self._run_task(config)


if __name__ == "__main__":
  tf.test.main()
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Common utils for tasks."""
import tensorflow as tf
import tensorflow_hub as hub
def get_encoder_from_hub(hub_module: str) -> tf.keras.Model:
  """Gets an encoder from hub."""
  input_word_ids = tf.keras.layers.Input(
      shape=(None,), dtype=tf.int32, name='input_word_ids')
  input_mask = tf.keras.layers.Input(
      shape=(None,), dtype=tf.int32, name='input_mask')
  input_type_ids = tf.keras.layers.Input(
      shape=(None,), dtype=tf.int32, name='input_type_ids')
  hub_layer = hub.KerasLayer(hub_module, trainable=True)
  pooled_output, sequence_output = hub_layer(
      [input_word_ids, input_mask, input_type_ids])
  return tf.keras.Model(
      inputs=[input_word_ids, input_mask, input_type_ids],
      outputs=[sequence_output, pooled_output])
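A sketch of calling the new helper directly; the hub handle is a placeholder, and note the tasks in this commit pass it the result of hub.load() (hub.KerasLayer accepts a loaded module as well as a handle string):

hub_module = hub.load('/path/or/url/to/bert_hub_module')  # hypothetical handle
encoder = get_encoder_from_hub(hub_module)
# encoder([input_word_ids, input_mask, input_type_ids])
#   -> [sequence_output, pooled_output]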