chenpangpang / transformers · Commits

Commit fb6c70a9
Authored Nov 12, 2019 by Louis MARTIN
Committed by Julien Chaumond, Nov 16, 2019

Update tokenization_camembert.py with urls

parent e44b939e
Showing 1 changed file with 17 additions and 2 deletions (+17 −2)
transformers/tokenization_camembert.py

@@ -11,7 +11,7 @@
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
-# limitations under the License.
+# limitations under the License
 """ Tokenization classes for Camembert model."""
 from __future__ import (absolute_import, division, print_function,
                         unicode_literals)
@@ -20,6 +20,19 @@ import sentencepiece as spm
 
 from transformers.tokenization_utils import PreTrainedTokenizer
 
+VOCAB_FILES_NAMES = {'vocab_file': 'sentencepiece.bpe.model'}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    'vocab_file':
+    {
+        'camembert-base': "https://dl.fbaipublicfiles.com/camembert/camembert-base-v0-sentencepiece.bpe.model",
+    }
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    'camembert-base': None,
+}
+
 class CamembertTokenizer(PreTrainedTokenizer):
     """
     Adapted from RobertaTokenizer and XLNetTokenizer
@@ -27,7 +40,9 @@ class CamembertTokenizer(PreTrainedTokenizer):
     - requires `SentencePiece <https://github.com/google/sentencepiece>`_
     """
-    vocab_files_names = {'vocab_file': None}
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
 
     def __init__(self, vocab_file, bos_token="<s>", eos_token="</s>", sep_token="</s>",
                  cls_token="<s>", unk_token="<unk>", pad_token='<pad>', mask_token='<mask>',
                  **kwargs):
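A note on what the change enables, not part of the commit itself: by pointing vocab_files_names at a real file name and filling in pretrained_vocab_files_map, the tokenizer can resolve the shortcut name 'camembert-base' to the hosted SentencePiece model and download it, instead of requiring a local vocab file path. A minimal usage sketch, assuming a transformers version that includes this commit; the sample sentence is illustrative:

    from transformers import CamembertTokenizer

    # Resolves 'camembert-base' via PRETRAINED_VOCAB_FILES_MAP and downloads
    # sentencepiece.bpe.model from dl.fbaipublicfiles.com on first use.
    tokenizer = CamembertTokenizer.from_pretrained('camembert-base')

    # SentencePiece subword tokenization of a French sentence.
    print(tokenizer.tokenize("J'aime le camembert !"))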