chenpangpang / transformers

Commit 4d16b279
authored Aug 28, 2019 by VictorSanh

add `scripts/binarized_data.py`

parent c513415b
Showing 1 changed file with 60 additions and 0 deletions

examples/distillation/scripts/binarized_data.py  (new file, 0 → 100644)  +60 −0
import argparse
import pickle
import random
import time

import numpy as np

from pytorch_transformers import BertTokenizer

from ..utils import logger


def main():
    parser = argparse.ArgumentParser(description="Preprocess the data to avoid re-doing it several times by (tokenization + token_to_ids).")
    parser.add_argument('--file_path', type=str, default='data/dump.txt',
                        help='The path to the data.')
    parser.add_argument('--bert_tokenizer', type=str, default='bert-base-uncased',
                        help="The tokenizer to use.")
    parser.add_argument('--dump_file', type=str, default='data/dump',
                        help='The dump file prefix.')
    args = parser.parse_args()

    logger.info(f'Loading Tokenizer ({args.bert_tokenizer})')
    bert_tokenizer = BertTokenizer.from_pretrained(args.bert_tokenizer)

    logger.info(f'Loading text from {args.file_path}')
    with open(args.file_path, 'r', encoding='utf8') as fp:
        data = fp.readlines()

    logger.info(f'Start encoding')
    logger.info(f'{len(data)} examples to process.')

    rslt = []
    iter = 0
    interval = 10000
    start = time.time()
    for text in data:
        text = f'[CLS] {text.strip()} [SEP]'
        token_ids = bert_tokenizer.encode(text)
        rslt.append(token_ids)

        iter += 1
        if iter % interval == 0:
            end = time.time()
            logger.info(f'{iter} examples processed. - {(end-start)/interval:.2f}s/expl')
            start = time.time()
    logger.info('Finished binarization')
    logger.info(f'{len(data)} examples processed.')

    dp_file = f'{args.dump_file}.{args.bert_tokenizer}.pickle'
    rslt_ = [np.uint16(d) for d in rslt]
    random.shuffle(rslt_)
    logger.info(f'Dump to {dp_file}')
    with open(dp_file, 'wb') as handle:
        pickle.dump(rslt_, handle, protocol=pickle.HIGHEST_PROTOCOL)


if __name__ == "__main__":
    main()
\ No newline at end of file
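Side note (not part of the commit): with its default arguments, the script tokenizes each line of data/dump.txt, wraps it in [CLS] ... [SEP], and pickles a shuffled list of uint16 token-id arrays to data/dump.bert-base-uncased.pickle (the name is derived from --dump_file and --bert_tokenizer). A minimal sketch of reading that output back, assuming those defaults:

import pickle

# Path assumes the default arguments: '{--dump_file}.{--bert_tokenizer}.pickle'
with open('data/dump.bert-base-uncased.pickle', 'rb') as handle:
    sequences = pickle.load(handle)  # list of np.uint16 arrays, one per input line, shuffled

print(f'{len(sequences)} tokenized sequences')
print(sequences[0][:10])  # first ten token ids of one example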