Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
898efca7
Unverified
Commit
898efca7
authored
Apr 19, 2023
by
Matt
Committed by
GitHub
Apr 19, 2023
Browse files
Fix removal of ESM special tokens (#22870)
Fix to make sure the EOS token doesn't come back
parent
a8aad0ec
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
15 additions
and
6 deletions
+15
-6
src/transformers/models/esm/tokenization_esm.py
src/transformers/models/esm/tokenization_esm.py
+15
-6
No files found.
src/transformers/models/esm/tokenization_esm.py
View file @
898efca7
...
@@ -54,16 +54,25 @@ class EsmTokenizer(PreTrainedTokenizer):
...
@@ -54,16 +54,25 @@ class EsmTokenizer(PreTrainedTokenizer):
max_model_input_sizes
=
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
max_model_input_sizes
=
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names
=
[
"input_ids"
,
"attention_mask"
]
model_input_names
=
[
"input_ids"
,
"attention_mask"
]
def
__init__
(
self
,
vocab_file
,
**
kwargs
):
def
__init__
(
self
,
vocab_file
,
unk_token
=
"<unk>"
,
cls_token
=
"<cls>"
,
pad_token
=
"<pad>"
,
mask_token
=
"<mask>"
,
eos_token
=
"<eos>"
,
**
kwargs
,
):
super
().
__init__
(
**
kwargs
)
super
().
__init__
(
**
kwargs
)
self
.
all_tokens
=
load_vocab_file
(
vocab_file
)
self
.
all_tokens
=
load_vocab_file
(
vocab_file
)
self
.
_id_to_token
=
dict
(
enumerate
(
self
.
all_tokens
))
self
.
_id_to_token
=
dict
(
enumerate
(
self
.
all_tokens
))
self
.
_token_to_id
=
{
tok
:
ind
for
ind
,
tok
in
enumerate
(
self
.
all_tokens
)}
self
.
_token_to_id
=
{
tok
:
ind
for
ind
,
tok
in
enumerate
(
self
.
all_tokens
)}
self
.
unk_token
=
"<unk>"
self
.
unk_token
=
unk_token
self
.
cls_token
=
"<cls>"
self
.
cls_token
=
cls_token
self
.
pad_token
=
"<pad>"
self
.
pad_token
=
pad_token
self
.
mask_token
=
"<mask>"
self
.
mask_token
=
mask_token
self
.
eos_token
=
"<eos>"
self
.
eos_token
=
eos_token
self
.
unique_no_split_tokens
=
self
.
all_tokens
self
.
unique_no_split_tokens
=
self
.
all_tokens
self
.
_create_trie
(
self
.
unique_no_split_tokens
)
self
.
_create_trie
(
self
.
unique_no_split_tokens
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment