chenpangpang / transformers

Commit 6c5297a4: Fixing camembert tokenization
Authored Dec 05, 2019 by thomwolf
Parent: 1f179f09
Showing 1 changed file with 5 additions and 2 deletions:
transformers/tokenization_camembert.py (+5, -2)

@@ -51,7 +51,7 @@ class CamembertTokenizer(PreTrainedTokenizer):
     def __init__(self, vocab_file, bos_token="<s>", eos_token="</s>", sep_token="</s>",
                  cls_token="<s>", unk_token="<unk>", pad_token='<pad>', mask_token='<mask>',
-                 additional_special_tokens=['<s>NOTUSED', '<s>NOTUSED'], **kwargs):
+                 additional_special_tokens=['<s>NOTUSED', '</s>NOTUSED'], **kwargs):
         super(CamembertTokenizer, self).__init__(max_len=512, bos_token=bos_token, eos_token=eos_token, unk_token=unk_token,
                                                  sep_token=sep_token, cls_token=cls_token, pad_token=pad_token,
                                                  mask_token=mask_token, additional_special_tokens=additional_special_tokens,
@@ -125,7 +125,7 @@ class CamembertTokenizer(PreTrainedTokenizer):
     @property
     def vocab_size(self):
-        return self.fairseq_offset + len(self.sp_model)
+        return len(self.fairseq_tokens_to_ids) + len(self.sp_model)

     def _tokenize(self, text):
         return self.sp_model.EncodeAsPieces(text)
@@ -134,6 +134,9 @@ class CamembertTokenizer(PreTrainedTokenizer):
         """ Converts a token (str/unicode) in an id using the vocab. """
         if token in self.fairseq_tokens_to_ids:
             return self.fairseq_tokens_to_ids[token]
+        elif self.sp_model.PieceToId(token) == 0:
+            # Convert sentence piece unk token to fairseq unk token index
+            return self.unk_token_id
         return self.fairseq_offset + self.sp_model.PieceToId(token)

     def _convert_id_to_token(self, index):
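Note on the third hunk, which carries the behavioural fix: SentencePiece returns piece id 0 for anything outside its vocabulary, and before this commit that 0 was shifted by fairseq_offset like any ordinary piece, so every out-of-vocabulary token received a valid-looking but wrong id instead of the fairseq <unk> id. The sketch below mirrors the patched lookup outside the library; the names convert_token_to_id, _sp_piece_to_id, FAIRSEQ_TOKENS_TO_IDS, FAIRSEQ_OFFSET and the toy vocabulary are illustrative assumptions, not values read from the repository.

# Minimal sketch (assumptions, not library code): replicate the patched
# _convert_token_to_id lookup without loading a real SentencePiece model.

FAIRSEQ_TOKENS_TO_IDS = {'<s>NOTUSED': 0, '<pad>': 1, '</s>NOTUSED': 2, '<unk>': 3}  # assumed ids
FAIRSEQ_OFFSET = len(FAIRSEQ_TOKENS_TO_IDS)  # assumed to be 4, mirroring the real tokenizer

def _sp_piece_to_id(piece):
    # Hypothetical stand-in for sp_model.PieceToId: unknown pieces return 0,
    # which is SentencePiece's own <unk> id.
    toy_vocab = {'<unk>': 0, '\u2581le': 5, '\u2581chat': 6}
    return toy_vocab.get(piece, 0)

def convert_token_to_id(token):
    """Mirror of the patched CamembertTokenizer._convert_token_to_id logic."""
    if token in FAIRSEQ_TOKENS_TO_IDS:
        return FAIRSEQ_TOKENS_TO_IDS[token]
    elif _sp_piece_to_id(token) == 0:
        # New branch from this commit: SentencePiece's unk (id 0) is remapped
        # to the fairseq <unk> id instead of being shifted by the offset.
        return FAIRSEQ_TOKENS_TO_IDS['<unk>']
    return FAIRSEQ_OFFSET + _sp_piece_to_id(token)

print(convert_token_to_id('\u2581chat'))   # 10 -> ordinary piece, shifted by the offset
print(convert_token_to_id('banana!!!'))    # 3  -> unknown piece now maps to <unk>;
                                           #      before the fix this returned FAIRSEQ_OFFSET + 0

The other two hunks follow the same theme: counting len(self.fairseq_tokens_to_ids) directly means every special token registered in that mapping is reflected in the reported vocab_size rather than only the offset block, and the first hunk replaces the duplicated '<s>NOTUSED' entry so that '</s>NOTUSED' is actually registered as an additional special token.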