"...git@developer.sourcefind.cn:chenpangpang/open-webui.git" did not exist on "1000bcaeb7d035d692a534280bccf9b710588a94"
Unverified commit d812e6d7, authored by Stefan Schweter, committed by GitHub
Browse files

NER: fix construction of input examples for RoBERTa (#4943)

* utils_ner: do not add extra sep token for RoBERTa model

* run_pl_ner: do not add extra sep token for RoBERTa model
parent ebab096e
...@@ -65,7 +65,7 @@ class NERTransformer(BaseTransformer): ...@@ -65,7 +65,7 @@ class NERTransformer(BaseTransformer):
cls_token=self.tokenizer.cls_token, cls_token=self.tokenizer.cls_token,
cls_token_segment_id=2 if self.config.model_type in ["xlnet"] else 0, cls_token_segment_id=2 if self.config.model_type in ["xlnet"] else 0,
sep_token=self.tokenizer.sep_token, sep_token=self.tokenizer.sep_token,
sep_token_extra=bool(self.config.model_type in ["roberta"]), sep_token_extra=False,
pad_on_left=bool(self.config.model_type in ["xlnet"]), pad_on_left=bool(self.config.model_type in ["xlnet"]),
pad_token=self.tokenizer.pad_token_id, pad_token=self.tokenizer.pad_token_id,
pad_token_segment_id=self.tokenizer.pad_token_type_id, pad_token_segment_id=self.tokenizer.pad_token_type_id,
......
...@@ -119,7 +119,7 @@ if is_torch_available(): ...@@ -119,7 +119,7 @@ if is_torch_available():
cls_token=tokenizer.cls_token, cls_token=tokenizer.cls_token,
cls_token_segment_id=2 if model_type in ["xlnet"] else 0, cls_token_segment_id=2 if model_type in ["xlnet"] else 0,
sep_token=tokenizer.sep_token, sep_token=tokenizer.sep_token,
sep_token_extra=bool(model_type in ["roberta"]), sep_token_extra=False,
# roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805 # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
pad_on_left=bool(tokenizer.padding_side == "left"), pad_on_left=bool(tokenizer.padding_side == "left"),
pad_token=tokenizer.pad_token_id, pad_token=tokenizer.pad_token_id,
...@@ -172,7 +172,7 @@ if is_tf_available(): ...@@ -172,7 +172,7 @@ if is_tf_available():
cls_token=tokenizer.cls_token, cls_token=tokenizer.cls_token,
cls_token_segment_id=2 if model_type in ["xlnet"] else 0, cls_token_segment_id=2 if model_type in ["xlnet"] else 0,
sep_token=tokenizer.sep_token, sep_token=tokenizer.sep_token,
sep_token_extra=bool(model_type in ["roberta"]), sep_token_extra=False,
# roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805 # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
pad_on_left=bool(tokenizer.padding_side == "left"), pad_on_left=bool(tokenizer.padding_side == "left"),
pad_token=tokenizer.pad_token_id, pad_token=tokenizer.pad_token_id,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment