OpenDAS / Fairseq · Commits

Commit cab76554, authored Oct 12, 2017 by Louis Martin; committed by Myle Ott on Oct 19, 2017.
Refactor code in Tokenizer
Parent: eea50f38
Showing 1 changed file with 22 additions and 25 deletions.
fairseq/tokenizer.py (+22 -25)
@@ -6,7 +6,9 @@
 # can be found in the PATENTS file in the same directory.
 #
 
+from collections import Counter
 import re
+
 import torch
 
 from fairseq import dictionary
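The Counter imported above replaces the hand-rolled dict bookkeeping in the binarize hunk below: replaced.update([word]) increments a single key, and the separate nunk counter becomes derivable. A minimal sketch of the standard-library semantics relied on; the word list here is hypothetical:

    from collections import Counter

    replaced = Counter()
    for word in ['foo', 'bar', 'foo']:   # hypothetical unk'ed words
        replaced.update([word])          # missing keys implicitly start at 0

    assert sum(replaced.values()) == 3   # total replacements (the old 'nunk')
    assert len(replaced) == 2            # distinct replaced word types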
@@ -32,46 +34,41 @@ class Tokenizer:
 
     @staticmethod
     def add_file_to_dictionary(filename, dict, tokenize):
         with open(filename, 'r') as f:
-            for line in f.readlines():
+            for line in f:
                 for word in tokenize(line):
                     dict.add_symbol(word)
                 dict.add_symbol(dict.eos_word)
 
     @staticmethod
     def binarize(filename, dict, consumer, tokenize=tokenize_line):
-        nseq, ntok, nunk = 0, 0, 0
-        replaced = {}
+        nseq, ntok = 0, 0
+        replaced = Counter()
+
+        def replaced_consumer(word, idx):
+            if idx == dict.unk_index and word != dict.unk_word:
+                replaced.update([word])
+
         with open(filename, 'r') as f:
-            for line in f.readlines():
-                words = tokenize(line)
-                nwords = len(words)
-                ids = torch.IntTensor(nwords + 1)
-                nseq = nseq + 1
-                for i in range(0, len(words)):
-                    word = words[i]
-                    idx = dict.index(word)
-                    if idx == dict.unk_index and word != dict.unk_word:
-                        nunk = nunk + 1
-                        if word in replaced:
-                            replaced[word] = replaced[word] + 1
-                        else:
-                            replaced[word] = 1
-                    ids[i] = idx
-                ids[nwords] = dict.eos_index
+            for line in f:
+                ids = Tokenizer.tokenize(line, dict, tokenize, add_if_not_exist=False, consumer=replaced_consumer)
+                nseq += 1
                 consumer(ids)
-                ntok = ntok + len(ids)
-        return {'nseq': nseq, 'nunk': nunk, 'ntok': ntok, 'replaced': len(replaced)}
+                ntok += len(ids)
+        return {'nseq': nseq, 'nunk': sum(replaced.values()), 'ntok': ntok, 'replaced': len(replaced)}
 
     @staticmethod
-    def tokenize(line, dict, tokenize=tokenize_line, add_if_not_exist=True):
+    def tokenize(line, dict, tokenize=tokenize_line, add_if_not_exist=True, consumer=None):
         words = tokenize(line)
         nwords = len(words)
         ids = torch.IntTensor(nwords + 1)
-        for i in range(0, len(words)):
+        for i, word in enumerate(words):
             if add_if_not_exist:
-                ids[i] = dict.add_symbol(words[i])
+                idx = dict.add_symbol(word)
             else:
-                ids[i] = dict.index(words[i])
+                idx = dict.index(word)
+            if consumer is not None:
+                consumer(word, idx)
+            ids[i] = idx
         ids[nwords] = dict.eos_index
         return ids
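With binarize now delegating to Tokenizer.tokenize through the new consumer callback, the unk statistics fall out of the Counter (nunk is sum(replaced.values()), replaced is len(replaced)), and the two tokenization code paths can no longer drift apart. A minimal usage sketch, assuming fairseq.dictionary exposes a Dictionary class constructible with defaults and that 'train.txt' stands in for a real corpus file; neither name is confirmed by this commit:

    # Hypothetical usage sketch, not part of this commit. Assumes
    # dictionary.Dictionary() provides the add_symbol/index/unk_index/eos_index
    # members the diff relies on, and that 'train.txt' exists on disk.
    from fairseq import dictionary
    from fairseq.tokenizer import Tokenizer, tokenize_line

    word_dict = dictionary.Dictionary()
    Tokenizer.add_file_to_dictionary('train.txt', word_dict, tokenize_line)

    tensors = []  # consumer receives one IntTensor of ids per input line
    stats = Tokenizer.binarize('train.txt', word_dict, tensors.append)

    # Binarizing a held-out file against this dictionary is where 'nunk'
    # (total unk replacements) and 'replaced' (distinct unk'ed types) become
    # nonzero; on the training file itself every word is known.
    print(stats['nseq'], stats['ntok'], stats['nunk'], stats['replaced'])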