"comfy/git@developer.sourcefind.cn:chenpangpang/ComfyUI.git" did not exist on "61b3f15f8f2bc0822cb98eac48742fb32f6af396"
Unverified commit 051dcb2a, authored by Lysandre Debut and committed by GitHub
Browse files

CamemBERT does not make use of Token Type IDs (#4289)

parent 41e82912
...@@ -102,6 +102,7 @@ class CamembertTokenizer(PreTrainedTokenizer): ...@@ -102,6 +102,7 @@ class CamembertTokenizer(PreTrainedTokenizer):
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["attention_mask"]
def __init__( def __init__(
self, self,
...@@ -200,14 +201,7 @@ class CamembertTokenizer(PreTrainedTokenizer): ...@@ -200,14 +201,7 @@ class CamembertTokenizer(PreTrainedTokenizer):
) -> List[int]: ) -> List[int]:
""" """
Creates a mask from the two sequences passed to be used in a sequence-pair classification task. Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
A CamemBERT sequence pair mask has the following format: CamemBERT, like RoBERTa, does not make use of token type ids, therefore a list of zeros is returned.
::
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | | second sequence |
if token_ids_1 is None, only returns the first portion of the mask (0s).
Args: Args:
token_ids_0 (:obj:`List[int]`): token_ids_0 (:obj:`List[int]`):
...@@ -216,15 +210,15 @@ class CamembertTokenizer(PreTrainedTokenizer): ...@@ -216,15 +210,15 @@ class CamembertTokenizer(PreTrainedTokenizer):
Optional second list of IDs for sequence pairs. Optional second list of IDs for sequence pairs.
Returns: Returns:
:obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given :obj:`List[int]`: List of zeros.
sequence(s).
""" """
sep = [self.sep_token_id] sep = [self.sep_token_id]
cls = [self.cls_token_id] cls = [self.cls_token_id]
if token_ids_1 is None: if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0] return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep + sep) * [0] + len(token_ids_1 + sep) * [1] return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
@property @property
def vocab_size(self): def vocab_size(self):
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment