@@ -77,6 +78,8 @@ class PreTrainedTokenizer(object):
"pad_token","cls_token","mask_token",
"additional_special_tokens"]
padding_side="right"
@property
def bos_token(self):
""" Beginning of sentence token (string). Log an error if used while not having been set. """
...
...
@@ -190,6 +193,11 @@ class PreTrainedTokenizer(object):
""" Id of the padding token in the vocabulary. Log an error if used while not having been set. """
return self.convert_tokens_to_ids(self.pad_token)
@property
def pad_token_type_id(self):
    """Id of the padding token type in the vocabulary.

    Unlike the other special-token id properties, this reads a stored
    integer attribute directly (no vocabulary lookup), so it does not
    log an error when unset.
    """
    # Simple accessor over the private attribute set at tokenizer init.
    return self._pad_token_type_id
@property
def cls_token_id(self):
""" Id of the classification token in the vocabulary. E.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set. """
...
...
@@ -213,10 +221,14 @@ class PreTrainedTokenizer(object):
@@ -243,6 +255,7 @@ class PreTrainedTokenizer(object):
pretrained_model_name_or_path: either:
- a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``.
- a string with the `identifier name` of a predefined tokenizer that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
- a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``.
- (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``.
...
...
@@ -270,6 +283,9 @@ class PreTrainedTokenizer(object):