Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
22ac004a
Commit
22ac004a
authored
Aug 12, 2019
by
LysandreJik
Browse files
Added documentation and changed parameters for special_tokens_sentences_pair.
parent
912fdff8
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
49 additions
and
17 deletions
+49
-17
pytorch_transformers/tokenization_bert.py
pytorch_transformers/tokenization_bert.py
+10
-2
pytorch_transformers/tokenization_roberta.py
pytorch_transformers/tokenization_roberta.py
+15
-7
pytorch_transformers/tokenization_utils.py
pytorch_transformers/tokenization_utils.py
+1
-1
pytorch_transformers/tokenization_xlm.py
pytorch_transformers/tokenization_xlm.py
+10
-2
pytorch_transformers/tokenization_xlnet.py
pytorch_transformers/tokenization_xlnet.py
+13
-5
No files found.
pytorch_transformers/tokenization_bert.py
View file @
22ac004a
...
...
@@ -167,12 +167,20 @@ class BertTokenizer(PreTrainedTokenizer):
return
out_string
def add_special_tokens_single_sentence(self, token_ids):
    """
    Adds special tokens to a sequence for sequence classification tasks.
    A BERT sequence has the following format: [CLS] X [SEP]
    """
    # Wrap the sequence with the classifier and separator token ids.
    cls = [self._convert_token_to_id(self.cls_token)]
    sep = [self._convert_token_to_id(self.sep_token)]
    return cls + token_ids + sep
def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
    """
    Adds special tokens to a sequence pair for sequence classification tasks.
    A BERT sequence pair has the following format: [CLS] A [SEP] B [SEP]
    """
    sep = [self._convert_token_to_id(self.sep_token)]
    cls = [self._convert_token_to_id(self.cls_token)]
    # Both segments end with a separator; only the first is preceded by [CLS].
    return cls + token_ids_0 + sep + token_ids_1 + sep
def
save_vocabulary
(
self
,
vocab_path
):
"""Save the tokenizer vocabulary to a directory or file."""
...
...
pytorch_transformers/tokenization_roberta.py
View file @
22ac004a
...
...
@@ -12,7 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for
OpenAI GPT
."""
"""Tokenization classes for
RoBERTa
."""
from
__future__
import
(
absolute_import
,
division
,
print_function
,
unicode_literals
)
...
...
@@ -57,15 +57,15 @@ PRETRAINED_VOCAB_FILES_MAP = {
}
# Maximum sequence length (in tokens) supported by each pretrained checkpoint.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    'roberta-base': 512,
    'roberta-large': 512,
    'roberta-large-mnli': 512,
}
class
RobertaTokenizer
(
PreTrainedTokenizer
):
"""
GPT-2
BPE
tokenizer. Peculiarities:
RoBERTa BPE tokenizer, derived from the
GPT-2 tokenizer. Peculiarities:
- Byte-level BPE
"""
vocab_files_names
=
VOCAB_FILES_NAMES
...
...
@@ -161,12 +161,20 @@ class RobertaTokenizer(PreTrainedTokenizer):
return
text
def add_special_tokens_single_sentence(self, token_ids):
    """
    Adds special tokens to a sequence for sequence classification tasks.
    A RoBERTa sequence has the following format: [CLS] X [SEP]
    """
    # Surround the sequence with the classifier and separator token ids.
    cls = [self._convert_token_to_id(self.cls_token)]
    sep = [self._convert_token_to_id(self.sep_token)]
    return cls + token_ids + sep
def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
    """
    Adds special tokens to a sequence pair for sequence classification tasks.
    A RoBERTa sequence pair has the following format: [CLS] A [SEP][SEP] B [SEP]
    """
    sep = [self._convert_token_to_id(self.sep_token)]
    cls = [self._convert_token_to_id(self.cls_token)]
    # RoBERTa separates the two segments with a DOUBLE separator token.
    return cls + token_ids_0 + sep + sep + token_ids_1 + sep
def
save_vocabulary
(
self
,
save_directory
):
"""Save the tokenizer vocabulary and merge files to a directory."""
...
...
pytorch_transformers/tokenization_utils.py
View file @
22ac004a
...
...
@@ -546,7 +546,7 @@ class PreTrainedTokenizer(object):
def add_special_tokens_single_sentence(self, token_ids):
    """
    Adds model-specific special tokens to a single token-id sequence.
    Model subclasses (BERT, RoBERTa, XLM, XLNet, ...) must override this.
    """
    raise NotImplementedError
def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
    """
    Adds model-specific special tokens to a pair of token-id sequences.
    Model subclasses (BERT, RoBERTa, XLM, XLNet, ...) must override this.
    """
    raise NotImplementedError
def
convert_ids_to_tokens
(
self
,
ids
,
skip_special_tokens
=
False
):
...
...
pytorch_transformers/tokenization_xlm.py
View file @
22ac004a
...
...
@@ -215,12 +215,20 @@ class XLMTokenizer(PreTrainedTokenizer):
return
out_string
def add_special_tokens_single_sentence(self, token_ids):
    """
    Adds special tokens to a sequence for sequence classification tasks.
    An XLM sequence has the following format: [CLS] X [SEP]
    """
    # Same framing as BERT: classifier token first, separator last.
    cls = [self._convert_token_to_id(self.cls_token)]
    sep = [self._convert_token_to_id(self.sep_token)]
    return cls + token_ids + sep
def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
    """
    Adds special tokens to a sequence pair for sequence classification tasks.
    An XLM sequence pair has the following format: [CLS] A [SEP] B [SEP]
    """
    sep = [self._convert_token_to_id(self.sep_token)]
    cls = [self._convert_token_to_id(self.cls_token)]
    # Single separator between segments, one classifier token up front.
    return cls + token_ids_0 + sep + token_ids_1 + sep
def
save_vocabulary
(
self
,
save_directory
):
"""Save the tokenizer vocabulary and merge files to a directory."""
...
...
pytorch_transformers/tokenization_xlnet.py
View file @
22ac004a
...
...
@@ -178,14 +178,22 @@ class XLNetTokenizer(PreTrainedTokenizer):
return
out_string
def add_special_tokens_single_sentence(self, token_ids):
    """
    Adds special tokens to a sequence for sequence classification tasks.
    An XLNet sequence has the following format: X [SEP][CLS]
    """
    # Fixed: the docstring previously described the sentence-PAIR format
    # (it was swapped with add_special_tokens_sentences_pair's docstring).
    sep = [self._convert_token_to_id(self.sep_token)]
    cls = [self._convert_token_to_id(self.cls_token)]
    # Unlike BERT, XLNet places its classifier token at the END of the sequence.
    return token_ids + sep + cls
def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
    """
    Adds special tokens to a sequence pair for sequence classification tasks.
    An XLNet sequence pair has the following format: A [SEP] B [SEP][CLS]
    """
    # Fixed: the docstring previously described the SINGLE-sentence format
    # (it was swapped with add_special_tokens_single_sentence's docstring).
    sep = [self._convert_token_to_id(self.sep_token)]
    cls = [self._convert_token_to_id(self.cls_token)]
    # Trailing [CLS] after the second separator, per XLNet's input convention.
    return token_ids_0 + sep + token_ids_1 + sep + cls
def
save_vocabulary
(
self
,
save_directory
):
""" Save the sentencepiece vocabulary (copy original file) and special tokens file
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment