Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
32167cdf
Commit
32167cdf
authored
Nov 26, 2018
by
thomwolf
Browse files
remove convert_to_unicode and printable_text from examples
parent
ce37b8e4
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
19 additions
and
32 deletions
+19
-32
examples/extract_features.py
examples/extract_features.py
+2
-2
examples/run_classifier.py
examples/run_classifier.py
+11
-11
examples/run_squad.py
examples/run_squad.py
+5
-6
notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb
notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb
+1
-1
pytorch_pretrained_bert/tokenization.py
pytorch_pretrained_bert/tokenization.py
+0
-12
No files found.
examples/extract_features.py
View file @
32167cdf
...
...
@@ -28,7 +28,7 @@ import torch
from
torch.utils.data
import
TensorDataset
,
DataLoader
,
SequentialSampler
from
torch.utils.data.distributed
import
DistributedSampler
from
pytorch_pretrained_bert.tokenization
import
convert_to_unicode
,
BertTokenizer
from
pytorch_pretrained_bert.tokenization
import
BertTokenizer
from
pytorch_pretrained_bert.modeling
import
BertModel
logging
.
basicConfig
(
format
=
'%(asctime)s - %(levelname)s - %(name)s - %(message)s'
,
...
...
@@ -170,7 +170,7 @@ def read_examples(input_file):
unique_id
=
0
with
open
(
input_file
,
"r"
)
as
reader
:
while
True
:
line
=
convert_to_unicode
(
reader
.
readline
()
)
line
=
reader
.
readline
()
if
not
line
:
break
line
=
line
.
strip
()
...
...
examples/run_classifier.py
View file @
32167cdf
...
...
@@ -30,7 +30,7 @@ import torch
from
torch.utils.data
import
TensorDataset
,
DataLoader
,
RandomSampler
,
SequentialSampler
from
torch.utils.data.distributed
import
DistributedSampler
from
pytorch_pretrained_bert.tokenization
import
printable_text
,
convert_to_unicode
,
BertTokenizer
from
pytorch_pretrained_bert.tokenization
import
BertTokenizer
from
pytorch_pretrained_bert.modeling
import
BertForSequenceClassification
from
pytorch_pretrained_bert.optimization
import
BertAdam
...
...
@@ -122,9 +122,9 @@ class MrpcProcessor(DataProcessor):
if
i
==
0
:
continue
guid
=
"%s-%s"
%
(
set_type
,
i
)
text_a
=
convert_to_unicode
(
line
[
3
]
)
text_b
=
convert_to_unicode
(
line
[
4
]
)
label
=
convert_to_unicode
(
line
[
0
]
)
text_a
=
line
[
3
]
text_b
=
line
[
4
]
label
=
line
[
0
]
examples
.
append
(
InputExample
(
guid
=
guid
,
text_a
=
text_a
,
text_b
=
text_b
,
label
=
label
))
return
examples
...
...
@@ -154,10 +154,10 @@ class MnliProcessor(DataProcessor):
for
(
i
,
line
)
in
enumerate
(
lines
):
if
i
==
0
:
continue
guid
=
"%s-%s"
%
(
set_type
,
convert_to_unicode
(
line
[
0
])
)
text_a
=
convert_to_unicode
(
line
[
8
])
text_b
=
convert_to_unicode
(
line
[
9
])
label
=
convert_to_unicode
(
line
[
-
1
]
)
guid
=
"%s-%s"
%
(
set_type
,
line
[
0
])
text_a
=
line
[
8
])
text_b
=
line
[
9
])
label
=
line
[
-
1
]
examples
.
append
(
InputExample
(
guid
=
guid
,
text_a
=
text_a
,
text_b
=
text_b
,
label
=
label
))
return
examples
...
...
@@ -185,8 +185,8 @@ class ColaProcessor(DataProcessor):
examples
=
[]
for
(
i
,
line
)
in
enumerate
(
lines
):
guid
=
"%s-%s"
%
(
set_type
,
i
)
text_a
=
convert_to_unicode
(
line
[
3
]
)
label
=
convert_to_unicode
(
line
[
1
]
)
text_a
=
line
[
3
]
label
=
line
[
1
]
examples
.
append
(
InputExample
(
guid
=
guid
,
text_a
=
text_a
,
text_b
=
None
,
label
=
label
))
return
examples
...
...
@@ -273,7 +273,7 @@ def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer
logger
.
info
(
"*** Example ***"
)
logger
.
info
(
"guid: %s"
%
(
example
.
guid
))
logger
.
info
(
"tokens: %s"
%
" "
.
join
(
[
printable_text
(
x
)
for
x
in
tokens
]))
[
str
(
x
)
for
x
in
tokens
]))
logger
.
info
(
"input_ids: %s"
%
" "
.
join
([
str
(
x
)
for
x
in
input_ids
]))
logger
.
info
(
"input_mask: %s"
%
" "
.
join
([
str
(
x
)
for
x
in
input_mask
]))
logger
.
info
(
...
...
examples/run_squad.py
View file @
32167cdf
...
...
@@ -32,7 +32,7 @@ import torch
from
torch.utils.data
import
TensorDataset
,
DataLoader
,
RandomSampler
,
SequentialSampler
from
torch.utils.data.distributed
import
DistributedSampler
from
pytorch_pretrained_bert.tokenization
import
printable_text
,
whitespace_tokenize
,
BasicTokenizer
,
BertTokenizer
from
pytorch_pretrained_bert.tokenization
import
whitespace_tokenize
,
BasicTokenizer
,
BertTokenizer
from
pytorch_pretrained_bert.modeling
import
BertForQuestionAnswering
from
pytorch_pretrained_bert.optimization
import
BertAdam
...
...
@@ -64,9 +64,9 @@ class SquadExample(object):
def
__repr__
(
self
):
s
=
""
s
+=
"qas_id: %s"
%
(
printable_text
(
self
.
qas_id
)
)
s
+=
"qas_id: %s"
%
(
self
.
qas_id
)
s
+=
", question_text: %s"
%
(
printable_text
(
self
.
question_text
)
)
self
.
question_text
)
s
+=
", doc_tokens: [%s]"
%
(
" "
.
join
(
self
.
doc_tokens
))
if
self
.
start_position
:
s
+=
", start_position: %d"
%
(
self
.
start_position
)
...
...
@@ -288,8 +288,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
logger
.
info
(
"unique_id: %s"
%
(
unique_id
))
logger
.
info
(
"example_index: %s"
%
(
example_index
))
logger
.
info
(
"doc_span_index: %s"
%
(
doc_span_index
))
logger
.
info
(
"tokens: %s"
%
" "
.
join
(
[
printable_text
(
x
)
for
x
in
tokens
]))
logger
.
info
(
"tokens: %s"
%
" "
.
join
(
tokens
))
logger
.
info
(
"token_to_orig_map: %s"
%
" "
.
join
([
"%d:%d"
%
(
x
,
y
)
for
(
x
,
y
)
in
token_to_orig_map
.
items
()]))
logger
.
info
(
"token_is_max_context: %s"
%
" "
.
join
([
...
...
@@ -305,7 +304,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
logger
.
info
(
"start_position: %d"
%
(
start_position
))
logger
.
info
(
"end_position: %d"
%
(
end_position
))
logger
.
info
(
"answer: %s"
%
(
printable_text
(
answer_text
))
)
"answer: %s"
%
(
answer_text
))
features
.
append
(
InputFeatures
(
...
...
notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb
View file @
32167cdf
...
...
@@ -133,7 +133,7 @@
" unique_id = 0\n",
" with tf.gfile.GFile(input_file, \"r\") as reader:\n",
" while True:\n",
" line = reader.readline()
#tokenization.convert_to_unicode(reader.readline())
\n",
" line = reader.readline()\n",
" if not line:\n",
" break\n",
" line = line.strip()\n",
...
...
pytorch_pretrained_bert/tokenization.py
View file @
32167cdf
...
...
@@ -38,18 +38,6 @@ PRETRAINED_VOCAB_ARCHIVE_MAP = {
'bert-base-chinese'
:
"https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt"
,
}
def
printable_text
(
text
):
"""Returns text encoded in a way suitable for print or `tf.logging`."""
# These functions want `str` for both Python2 and Python3, but in one case
# it's a Unicode string and in the other it's a byte string.
if
isinstance
(
text
,
str
):
return
text
elif
isinstance
(
text
,
bytes
):
return
text
.
decode
(
"utf-8"
,
"ignore"
)
else
:
raise
ValueError
(
"Unsupported string type: %s"
%
(
type
(
text
)))
def
load_vocab
(
vocab_file
):
"""Loads a vocabulary file into a dictionary."""
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment