"vscode:/vscode.git/clone" did not exist on "963529e29bdb862d1e5babb630f7b12b6c3081c6"
Unverified Commit 50e15c82 authored by Julien Chaumond's avatar Julien Chaumond Committed by GitHub
Browse files

Tokenizers: Start cleaning examples a little (#3455)

* Start cleaning examples

* Fixup
parent b38d552a
...@@ -68,7 +68,7 @@ class GLUETransformer(BaseTransformer): ...@@ -68,7 +68,7 @@ class GLUETransformer(BaseTransformer):
output_mode=args.glue_output_mode, output_mode=args.glue_output_mode,
pad_on_left=bool(args.model_type in ["xlnet"]), # pad on the left for xlnet pad_on_left=bool(args.model_type in ["xlnet"]), # pad on the left for xlnet
pad_token=self.tokenizer.convert_tokens_to_ids([self.tokenizer.pad_token])[0], pad_token=self.tokenizer.convert_tokens_to_ids([self.tokenizer.pad_token])[0],
pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0, pad_token_segment_id=self.tokenizer.pad_token_type_id,
) )
logger.info("Saving features into cached file %s", cached_features_file) logger.info("Saving features into cached file %s", cached_features_file)
torch.save(features, cached_features_file) torch.save(features, cached_features_file)
......
...@@ -342,8 +342,8 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False): ...@@ -342,8 +342,8 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
max_length=args.max_seq_length, max_length=args.max_seq_length,
output_mode=output_mode, output_mode=output_mode,
pad_on_left=bool(args.model_type in ["xlnet"]), # pad on the left for xlnet pad_on_left=bool(args.model_type in ["xlnet"]), # pad on the left for xlnet
pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], pad_token=tokenizer.pad_token_id,
pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0, pad_token_segment_id=tokenizer.pad_token_type_id,
) )
if args.local_rank in [-1, 0]: if args.local_rank in [-1, 0]:
logger.info("Saving features into cached file %s", cached_features_file) logger.info("Saving features into cached file %s", cached_features_file)
......
...@@ -348,8 +348,8 @@ def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode): ...@@ -348,8 +348,8 @@ def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode):
# roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805 # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
pad_on_left=bool(args.model_type in ["xlnet"]), pad_on_left=bool(args.model_type in ["xlnet"]),
# pad on the left for xlnet # pad on the left for xlnet
pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], pad_token=tokenizer.pad_token_id,
pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0, pad_token_segment_id=tokenizer.pad_token_type_id,
pad_token_label_id=pad_token_label_id, pad_token_label_id=pad_token_label_id,
) )
if args.local_rank in [-1, 0]: if args.local_rank in [-1, 0]:
......
...@@ -64,8 +64,8 @@ class NERTransformer(BaseTransformer): ...@@ -64,8 +64,8 @@ class NERTransformer(BaseTransformer):
sep_token=self.tokenizer.sep_token, sep_token=self.tokenizer.sep_token,
sep_token_extra=bool(args.model_type in ["roberta"]), sep_token_extra=bool(args.model_type in ["roberta"]),
pad_on_left=bool(args.model_type in ["xlnet"]), pad_on_left=bool(args.model_type in ["xlnet"]),
pad_token=self.tokenizer.convert_tokens_to_ids([self.tokenizer.pad_token])[0], pad_token=self.tokenizer.pad_token_id,
pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0, pad_token_segment_id=self.tokenizer.pad_token_type_id,
pad_token_label_id=self.pad_token_label_id, pad_token_label_id=self.pad_token_label_id,
) )
logger.info("Saving features into cached file %s", cached_features_file) logger.info("Saving features into cached file %s", cached_features_file)
......
...@@ -434,8 +434,8 @@ def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, batch_s ...@@ -434,8 +434,8 @@ def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, batch_s
# roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805 # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
pad_on_left=bool(args["model_type"] in ["xlnet"]), pad_on_left=bool(args["model_type"] in ["xlnet"]),
# pad on the left for xlnet # pad on the left for xlnet
pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], pad_token=tokenizer.pad_token_id,
pad_token_segment_id=4 if args["model_type"] in ["xlnet"] else 0, pad_token_segment_id=tokenizer.pad_token_type_id,
pad_token_label_id=pad_token_label_id, pad_token_label_id=pad_token_label_id,
) )
logging.info("Saving features into cached file %s", cached_features_file) logging.info("Saving features into cached file %s", cached_features_file)
......
...@@ -360,8 +360,8 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False): ...@@ -360,8 +360,8 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
max_length=args.max_seq_length, max_length=args.max_seq_length,
output_mode=output_mode, output_mode=output_mode,
pad_on_left=bool(args.model_type in ["xlnet"]), # pad on the left for xlnet pad_on_left=bool(args.model_type in ["xlnet"]), # pad on the left for xlnet
pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], pad_token=tokenizer.pad_token_id,
pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0, pad_token_segment_id=tokenizer.pad_token_type_id,
) )
if args.local_rank in [-1, 0]: if args.local_rank in [-1, 0]:
logger.info("Saving features into cached file %s", cached_features_file) logger.info("Saving features into cached file %s", cached_features_file)
......
...@@ -361,7 +361,7 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False, test=False): ...@@ -361,7 +361,7 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False, test=False):
args.max_seq_length, args.max_seq_length,
tokenizer, tokenizer,
pad_on_left=bool(args.model_type in ["xlnet"]), # pad on the left for xlnet pad_on_left=bool(args.model_type in ["xlnet"]), # pad on the left for xlnet
pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0, pad_token_segment_id=tokenizer.pad_token_type_id,
) )
if args.local_rank in [-1, 0]: if args.local_rank in [-1, 0]:
logger.info("Saving features into cached file %s", cached_features_file) logger.info("Saving features into cached file %s", cached_features_file)
......
...@@ -350,8 +350,8 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False): ...@@ -350,8 +350,8 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
max_length=args.max_seq_length, max_length=args.max_seq_length,
output_mode=output_mode, output_mode=output_mode,
pad_on_left=False, pad_on_left=False,
pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], pad_token=tokenizer.pad_token_id,
pad_token_segment_id=0, pad_token_segment_id=tokenizer.pad_token_type_id,
) )
if args.local_rank in [-1, 0]: if args.local_rank in [-1, 0]:
logger.info("Saving features into cached file %s", cached_features_file) logger.info("Saving features into cached file %s", cached_features_file)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment