"mmdet3d/vscode:/vscode.git/clone" did not exist on "148fea12e00a3f460cfeadd719100701ca63e5ff"
Commit 3d99cc89 authored by Maxim Neumann, committed by A. Unique TensorFlower

Internal change

PiperOrigin-RevId: 314412294
parent 2af9696b
@@ -462,7 +462,7 @@ class QnliProcessor(DataProcessor):
 class TfdsProcessor(DataProcessor):
-  """Processor for generic text classification TFDS data set.
+  """Processor for generic text classification and regression TFDS data set.

   The TFDS parameters are expected to be provided in the tfds_params string, in
   a comma-separated list of parameter assignments.
@@ -473,6 +473,8 @@ class TfdsProcessor(DataProcessor):
     tfds_params="dataset=glue/sst2,text_key=sentence"
     tfds_params="dataset=glue/qnli,text_key=question,text_b_key=sentence"
     tfds_params="dataset=glue/mrpc,text_key=sentence1,text_b_key=sentence2"
+    tfds_params="dataset=glue/stsb,text_key=sentence1,text_b_key=sentence2,"
+                "is_regression=true,label_type=float"

   Possible parameters (please refer to the documentation of TensorFlow Datasets
   (TFDS) for the meaning of individual parameters):
     dataset: Required dataset name (potentially with subset and version number).
@@ -487,6 +489,8 @@ class TfdsProcessor(DataProcessor):
     test_text_key: Key of the text feature to use in test set.
     test_text_b_key: Key of the second text feature to use in test set.
     test_label: String to be used as the label for all test examples.
+    label_type: Type of the label key (defaults to `int`).
+    is_regression: Whether the task is a regression problem (defaults to False).
   """

   def __init__(self, tfds_params,
@@ -498,10 +502,16 @@ class TfdsProcessor(DataProcessor):
     self.dataset, info = tfds.load(self.dataset_name, data_dir=self.data_dir,
                                    with_info=True)
-    self._labels = list(range(info.features[self.label_key].num_classes))
+    if self.is_regression:
+      self._labels = None
+    else:
+      self._labels = list(range(info.features[self.label_key].num_classes))

   def _process_tfds_params_str(self, params_str):
     """Extracts TFDS parameters from a comma-separated assignments string."""
+    dtype_map = {"int": int, "float": float}
+    cast_str_to_bool = lambda s: s.lower() not in ["false", "0"]
+
     tuples = [x.split("=") for x in params_str.split(",")]
     d = {k.strip(): v.strip() for k, v in tuples}
     self.dataset_name = d["dataset"]  # Required.
@@ -516,6 +526,8 @@ class TfdsProcessor(DataProcessor):
     self.test_text_key = d.get("test_text_key", self.text_key)
     self.test_text_b_key = d.get("test_text_b_key", self.text_b_key)
     self.test_label = d.get("test_label", "test_example")
+    self.label_type = dtype_map[d.get("label_type", "int")]
+    self.is_regression = cast_str_to_bool(d.get("is_regression", "False"))

   def get_train_examples(self, data_dir):
     assert data_dir is None
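
Taken together, the two additions above parse label_type and is_regression out of the comma-separated params string. A standalone sketch of that logic (parse_tfds_params is a hypothetical helper for illustration, not part of this commit):

    # Sketch of the parsing added above; not part of the commit itself.
    def parse_tfds_params(params_str):
      dtype_map = {"int": int, "float": float}
      cast_str_to_bool = lambda s: s.lower() not in ["false", "0"]
      tuples = [x.split("=") for x in params_str.split(",")]
      d = {k.strip(): v.strip() for k, v in tuples}
      return (dtype_map[d.get("label_type", "int")],
              cast_str_to_bool(d.get("is_regression", "False")))

    # The STS-B example from the docstring yields (float, True):
    label_type, is_regression = parse_tfds_params(
        "dataset=glue/stsb,text_key=sentence1,text_b_key=sentence2,"
        "is_regression=true,label_type=float")
    assert label_type is float and is_regression is True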
@@ -553,7 +565,7 @@ class TfdsProcessor(DataProcessor):
       text_a = self.process_text_fn(example[self.text_key])
       if self.text_b_key:
         text_b = self.process_text_fn(example[self.text_b_key])
-      label = int(example[self.label_key])
+      label = self.label_type(example[self.label_key])
       examples.append(
           InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
     return examples
@@ -563,8 +575,9 @@ def convert_single_example(ex_index, example, label_list, max_seq_length,
                            tokenizer):
   """Converts a single `InputExample` into a single `InputFeatures`."""
   label_map = {}
-  for (i, label) in enumerate(label_list):
-    label_map[label] = i
+  if label_list:
+    for (i, label) in enumerate(label_list):
+      label_map[label] = i

   tokens_a = tokenizer.tokenize(example.text_a)
   tokens_b = None
@@ -632,7 +645,7 @@ def convert_single_example(ex_index, example, label_list, max_seq_length,
   assert len(input_mask) == max_seq_length
   assert len(segment_ids) == max_seq_length

-  label_id = label_map[example.label]
+  label_id = label_map[example.label] if label_map else example.label
   if ex_index < 5:
     logging.info("*** Example ***")
     logging.info("guid: %s", (example.guid))
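
The two changes above work as a pair: for regression the processor reports no label list, so label_map stays empty and the float label passes through unchanged, while classification labels are still mapped to their index. A standalone sketch of that behavior (resolve_label is hypothetical, for illustration only):

    # Sketch of the label handling above; resolve_label is not in the commit.
    def resolve_label(label, label_list=None):
      label_map = {}
      if label_list:
        for i, name in enumerate(label_list):
          label_map[name] = i
      return label_map[label] if label_map else label

    assert resolve_label("entailment", ["entailment", "not_entailment"]) == 0
    assert resolve_label(3.8) == 3.8  # regression: float passes through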
@@ -654,7 +667,7 @@ def convert_single_example(ex_index, example, label_list, max_seq_length,
 def file_based_convert_examples_to_features(examples, label_list,
                                             max_seq_length, tokenizer,
-                                            output_file):
+                                            output_file, label_type=None):
   """Convert a set of `InputExample`s to a TFRecord file."""

   tf.io.gfile.makedirs(os.path.dirname(output_file))
@@ -670,12 +683,18 @@ def file_based_convert_examples_to_features(examples, label_list,
     def create_int_feature(values):
       f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
       return f

+    def create_float_feature(values):
+      f = tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))
+      return f
+
     features = collections.OrderedDict()
     features["input_ids"] = create_int_feature(feature.input_ids)
     features["input_mask"] = create_int_feature(feature.input_mask)
     features["segment_ids"] = create_int_feature(feature.segment_ids)
-    features["label_ids"] = create_int_feature([feature.label_id])
+    if label_type is not None and label_type == float:
+      features["label_ids"] = create_float_feature([feature.label_id])
+    else:
+      features["label_ids"] = create_int_feature([feature.label_id])
     features["is_real_example"] = create_int_feature(
         [int(feature.is_real_example)])
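
The effect of the new branch is that a regression label is serialized into the TFRecord as a float feature rather than an int64 feature. A self-contained illustration using only the tf.train proto API:

    import tensorflow as tf

    # Classification: the label index is stored as an int64 feature.
    int_label = tf.train.Feature(int64_list=tf.train.Int64List(value=[1]))
    # Regression (e.g. an STS-B similarity score): stored as a float feature.
    float_label = tf.train.Feature(float_list=tf.train.FloatList(value=[3.8]))

    example = tf.train.Example(
        features=tf.train.Features(feature={"label_ids": float_label}))
    serialized = example.SerializeToString()  # bytes written to the TFRecord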
@@ -731,18 +750,23 @@ def generate_tf_record_from_data_file(processor,
   assert train_data_output_path or eval_data_output_path

   label_list = processor.get_labels()
+  label_type = getattr(processor, "label_type", None)
+  is_regression = getattr(processor, "is_regression", False)
   assert train_data_output_path
   train_input_data_examples = processor.get_train_examples(data_dir)
   file_based_convert_examples_to_features(train_input_data_examples, label_list,
                                           max_seq_length, tokenizer,
-                                          train_data_output_path)
+                                          train_data_output_path,
+                                          label_type)
   num_training_data = len(train_input_data_examples)

   if eval_data_output_path:
     eval_input_data_examples = processor.get_dev_examples(data_dir)
     file_based_convert_examples_to_features(eval_input_data_examples,
                                             label_list, max_seq_length,
-                                            tokenizer, eval_data_output_path)
+                                            tokenizer, eval_data_output_path,
+                                            label_type)

   if test_data_output_path:
     test_input_data_examples = processor.get_test_examples(data_dir)
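
Reading the new attributes with getattr and a default keeps processors that predate this change working unchanged: a processor without them is treated as an int-labeled classification task. A minimal sketch:

    # Older processors lacking the new attributes fall back to classification.
    class LegacyProcessor:  # hypothetical stand-in for a pre-existing processor
      pass

    p = LegacyProcessor()
    label_type = getattr(p, "label_type", None)         # None -> int labels
    is_regression = getattr(p, "is_regression", False)  # False -> classification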
@@ -751,19 +775,25 @@ def generate_tf_record_from_data_file(processor,
       file_based_convert_examples_to_features(
           examples,
           label_list, max_seq_length,
-          tokenizer, test_data_output_path.format(language))
+          tokenizer, test_data_output_path.format(language),
+          label_type)
     else:
       file_based_convert_examples_to_features(test_input_data_examples,
                                               label_list, max_seq_length,
-                                              tokenizer, test_data_output_path)
+                                              tokenizer, test_data_output_path,
+                                              label_type)

   meta_data = {
-      "task_type": "bert_classification",
       "processor_type": processor.get_processor_name(),
-      "num_labels": len(processor.get_labels()),
       "train_data_size": num_training_data,
       "max_seq_length": max_seq_length,
   }
+  if is_regression:
+    meta_data["task_type"] = "bert_regression"
+    meta_data["label_type"] = {int: "int", float: "float"}[label_type]
+  else:
+    meta_data["task_type"] = "bert_classification"
+    meta_data["num_labels"] = len(processor.get_labels())

   if eval_data_output_path:
     meta_data["eval_data_size"] = len(eval_input_data_examples)
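
Downstream training code can branch on this metadata. For a regression run the resulting dictionary would look roughly like the following (all values are illustrative; the exact processor_type string and data sizes depend on the run):

    # Illustrative meta_data for a regression run; values are hypothetical.
    meta_data = {
        "processor_type": "TFDS",   # assumed name, for illustration only
        "train_data_size": 5749,    # hypothetical example count
        "max_seq_length": 128,
        "task_type": "bert_regression",
        "label_type": "float",
    }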
...
@@ -35,7 +35,8 @@ from official.nlp.data import squad_lib_sp
 FLAGS = flags.FLAGS

 flags.DEFINE_enum(
-    "fine_tuning_task_type", "classification", ["classification", "squad"],
+    "fine_tuning_task_type", "classification",
+    ["classification", "regression", "squad"],
     "The name of the BERT fine tuning task for which data "
     "will be generated.")
@@ -181,6 +182,34 @@ def generate_classifier_dataset():
       max_seq_length=FLAGS.max_seq_length)


+def generate_regression_dataset():
+  """Generates regression dataset and returns input meta data."""
+  if FLAGS.tokenizer_impl == "word_piece":
+    tokenizer = tokenization.FullTokenizer(
+        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
+    processor_text_fn = tokenization.convert_to_unicode
+  else:
+    assert FLAGS.tokenizer_impl == "sentence_piece"
+    tokenizer = tokenization.FullSentencePieceTokenizer(FLAGS.sp_model_file)
+    processor_text_fn = functools.partial(
+        tokenization.preprocess_text, lower=FLAGS.do_lower_case)
+
+  if FLAGS.tfds_params:
+    processor = classifier_data_lib.TfdsProcessor(
+        tfds_params=FLAGS.tfds_params,
+        process_text_fn=processor_text_fn)
+    return classifier_data_lib.generate_tf_record_from_data_file(
+        processor,
+        None,
+        tokenizer,
+        train_data_output_path=FLAGS.train_data_output_path,
+        eval_data_output_path=FLAGS.eval_data_output_path,
+        test_data_output_path=FLAGS.test_data_output_path,
+        max_seq_length=FLAGS.max_seq_length)
+  else:
+    raise ValueError("No data processor found for the given regression task.")
+
+
 def generate_squad_dataset():
   """Generates squad training dataset and returns input meta data."""
   assert FLAGS.squad_data_file
@@ -210,6 +239,8 @@ def main(_):
   if FLAGS.fine_tuning_task_type == "classification":
     input_meta_data = generate_classifier_dataset()
+  elif FLAGS.fine_tuning_task_type == "regression":
+    input_meta_data = generate_regression_dataset()
   else:
     input_meta_data = generate_squad_dataset()
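
End to end, a regression data-generation run for STS-B might then be launched roughly like this (assuming this script is the Model Garden's create_finetuning_data.py; paths are placeholders, and all flags shown appear in the diffs above):

    python create_finetuning_data.py \
      --fine_tuning_task_type=regression \
      --tfds_params="dataset=glue/stsb,text_key=sentence1,text_b_key=sentence2,is_regression=true,label_type=float" \
      --tokenizer_impl=word_piece \
      --vocab_file=/path/to/vocab.txt \
      --train_data_output_path=/tmp/stsb_train.tf_record \
      --eval_data_output_path=/tmp/stsb_eval.tf_record \
      --max_seq_length=128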
...