chenpangpang / transformers / Commits / 0dd5f55a

Unverified commit 0dd5f55a, authored Jan 09, 2019 by Thomas Wolf, committed by GitHub Jan 09, 2019

Merge pull request #172 from WrRan/never_split

Never split some texts.

Parents: 2e4db64c, 3f60a60e
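
For context, the failure mode this merge addresses: BasicTokenizer lowercases every token and then splits it on punctuation, so a control token such as "[CLS]" was mangled before this change. A minimal sketch of the old behavior (my own illustration; it approximates _run_split_on_punc with string.punctuation rather than the library's Unicode-category check):

import string

def split_on_punc(token):
    # Approximation of BasicTokenizer._run_split_on_punc: emit each
    # punctuation character as its own token, keep other runs together.
    output, word = [], ""
    for ch in token:
        if ch in string.punctuation:
            if word:
                output.append(word)
                word = ""
            output.append(ch)
        else:
            word += ch
    if word:
        output.append(word)
    return output

# Pre-patch pipeline: lowercase, then split on punctuation.
print(split_on_punc("[CLS]".lower()))  # -> ['[', 'cls', ']'] -- token destroyed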
Showing 1 changed file with 11 additions and 4 deletions.

pytorch_pretrained_bert/tokenization.py (+11, -4)
@@ -75,7 +75,8 @@ def whitespace_tokenize(text):
 class BertTokenizer(object):
     """Runs end-to-end tokenization: punctuation splitting + wordpiece"""

-    def __init__(self, vocab_file, do_lower_case=True, max_len=None):
+    def __init__(self, vocab_file, do_lower_case=True, max_len=None,
+                 never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
         if not os.path.isfile(vocab_file):
             raise ValueError(
                 "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
@@ -83,7 +84,8 @@ class BertTokenizer(object):
         self.vocab = load_vocab(vocab_file)
         self.ids_to_tokens = collections.OrderedDict(
             [(ids, tok) for tok, ids in self.vocab.items()])
-        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
+        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case,
+                                              never_split=never_split)
         self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
         self.max_len = max_len if max_len is not None else int(1e12)
@@ -156,13 +158,16 @@ class BertTokenizer(object):
 class BasicTokenizer(object):
     """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""

-    def __init__(self, do_lower_case=True):
+    def __init__(self,
+                 do_lower_case=True,
+                 never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
         """Constructs a BasicTokenizer.

         Args:
             do_lower_case: Whether to lower case the input.
         """
         self.do_lower_case = do_lower_case
+        self.never_split = never_split

     def tokenize(self, text):
         """Tokenizes a piece of text."""
@@ -177,7 +182,7 @@ class BasicTokenizer(object):
         orig_tokens = whitespace_tokenize(text)
         split_tokens = []
         for token in orig_tokens:
-            if self.do_lower_case:
+            if self.do_lower_case and token not in self.never_split:
                 token = token.lower()
                 token = self._run_strip_accents(token)
             split_tokens.extend(self._run_split_on_punc(token))
@@ -198,6 +203,8 @@ class BasicTokenizer(object):
     def _run_split_on_punc(self, text):
         """Splits punctuation on a piece of text."""
+        if text in self.never_split:
+            return [text]
         chars = list(text)
         i = 0
         start_new_word = True
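
Taken together, the tokenize() guard and the _run_split_on_punc early return let special tokens pass through untouched. A quick check of the expected post-patch behavior (the output is my reading of the diff, assuming the installed package includes this change):

from pytorch_pretrained_bert.tokenization import BasicTokenizer

tokenizer = BasicTokenizer(do_lower_case=True)  # default never_split applies

# "[SEP]" is neither lowercased nor split on '[' and ']';
# ordinary tokens are lowercased and punctuation-split as before.
print(tokenizer.tokenize("Hello [SEP] World!"))
# expected: ['hello', '[SEP]', 'world', '!']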