Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
Megatron-LM
Commits
3ec549ba
Commit
3ec549ba
authored
Dec 06, 2021
by
zihanl
Browse files
add comments
parent
08b00288
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
36 additions
and
27 deletions
+36
-27
tasks/knwl_dialo/preprocessing.py
tasks/knwl_dialo/preprocessing.py
+35
-23
tasks/knwl_dialo/prompt.py
tasks/knwl_dialo/prompt.py
+1
-4
No files found.
tasks/knwl_dialo/preprocessing.py
View file @
3ec549ba
...
...
@@ -46,6 +46,7 @@ def process_wow_dataset(raw_file, processed_file, knwl_ref_file, resp_ref_file):
topic
\t
dialogue context
\t
golden knowledge
\t
golden response
"""
# loading the raw data
print
(
"> Loading data from %s"
%
raw_file
)
with
open
(
raw_file
,
"r"
)
as
fr
:
dialog_data
=
json
.
load
(
fr
)
...
...
@@ -56,18 +57,20 @@ def process_wow_dataset(raw_file, processed_file, knwl_ref_file, resp_ref_file):
fresp
=
open
(
resp_ref_file
,
"w"
)
if
resp_ref_file
else
None
for
i
,
sample
in
enumerate
(
tqdm
(
dialog_data
)):
# get all the dialog data for a single sample
# get all the dialog data for a single
dialog
sample
dialog
=
sample
[
"dialog"
]
context
=
[]
turn_list
=
[]
# collect the dialog history
# processing for each single dialog sample
for
j
,
turn
in
enumerate
(
dialog
):
# text of each turn
text
=
turn
[
"text"
]
if
not
(
text
.
endswith
(
"?"
)
or
text
.
endswith
(
"."
)
or
text
.
endswith
(
"!"
)):
text
=
text
+
"."
if
j
==
0
:
# first turn
contex
t
.
append
(
text
)
turn_lis
t
.
append
(
text
)
continue
speaker
=
turn
[
"speaker"
].
lower
()
...
...
@@ -94,10 +97,14 @@ def process_wow_dataset(raw_file, processed_file, knwl_ref_file, resp_ref_file):
else
:
topic
=
sample
[
"chosen_topic"
]
dialog_context
=
" [SEP] "
.
join
(
turn_list
)
knowledge
=
checked_sentence
response
=
text
# add the response into the dialog history
turn_list
.
append
(
response
)
# write to the output files
fproc
.
write
(
topic
+
"
\t
"
+
" [SEP] "
.
join
(
context
)
+
"
\t
"
+
\
fproc
.
write
(
topic
+
"
\t
"
+
dialog_
context
+
"
\t
"
+
\
knowledge
+
"
\t
"
+
response
+
"
\n
"
)
if
fknwl
:
...
...
@@ -107,11 +114,9 @@ def process_wow_dataset(raw_file, processed_file, knwl_ref_file, resp_ref_file):
response
=
" "
.
join
(
word_tokenize
(
response
))
fresp
.
write
(
response
+
"
\n
"
)
context
.
append
(
text
)
else
:
assert
"apprentice"
in
speaker
contex
t
.
append
(
text
)
turn_lis
t
.
append
(
text
)
fproc
.
close
()
if
fknwl
:
...
...
@@ -134,16 +139,20 @@ def process_woi_dataset(raw_file, processed_file, knwl_ref_file, resp_ref_file):
with
open
(
raw_file
,
"r"
)
as
fr
:
for
i
,
line
in
tqdm
(
enumerate
(
fr
)):
# read line by line, each line uses json format
line
=
line
.
strip
()
item_dict
=
json
.
loads
(
line
)
# item_dict is a dictionary
# its key is the data id, and its value contains all the data content
item_dict
=
item_dict
.
values
()
assert
len
(
item_dict
)
==
1
item_dict
=
list
(
item_dict
)[
0
]
item_dict
=
list
(
item_dict
)[
0
]
# len(item_dict) == 1
# get the whole dialog data for a single dialog sample
dialog_data
=
item_dict
[
'dialog_history'
]
length
=
len
(
dialog_data
)
turn_list
=
[]
turn_list
=
[]
# collect the dialog history
search_text
=
""
for
i
in
range
(
length
):
item
=
dialog_data
[
i
]
...
...
@@ -154,6 +163,7 @@ def process_woi_dataset(raw_file, processed_file, knwl_ref_file, resp_ref_file):
elif
action
==
"Wizard => Apprentice"
:
if
len
(
turn_list
)
==
0
:
# first turn
turn
=
item
[
'text'
]
turn_list
.
append
(
turn
)
continue
...
...
@@ -167,27 +177,29 @@ def process_woi_dataset(raw_file, processed_file, knwl_ref_file, resp_ref_file):
# get the topic
if
flag
:
# no knowledge sentence is used
# no knowledge sentence is used
for the response
topic
=
"no_topic"
sent_lis
t
=
[
"no_passages_used"
]
knwl_sen
t
=
"no_passages_used"
else
:
#
assert
search
_
text
!= ""
#
we consider the
search
text
as the topic
topic
=
search_text
sent_lis
t
=
[]
# get the knowledge sentence
knwl_sen
t
=
""
for
content
,
select
in
zip
(
contents
,
selects
):
content
=
content
[
'content'
]
assert
len
(
content
)
==
len
(
select
)
for
c
,
s
in
zip
(
content
,
select
):
if
s
:
sent_list
.
append
(
c
)
if
len
(
sent_list
)
==
0
:
knwl_sent
=
c
break
if
knwl_sent
==
""
:
# no knowledge is used for the response
topic
=
"no_topic"
sent_lis
t
=
[
"no_passages_used"
]
knwl_sen
t
=
"no_passages_used"
# get dialogue context, knowledge, and response
dialog_context
=
" [SEP] "
.
join
(
turn_list
)
knwl_sent
=
sent_list
[
0
]
response
=
item
[
'text'
]
# processing
...
...
@@ -218,7 +230,8 @@ def process_woi_dataset(raw_file, processed_file, knwl_ref_file, resp_ref_file):
turn_list
.
append
(
turn
)
else
:
assert
action
==
"SearchAgent => Wizard"
assert
action
==
"SearchAgent => Wizard"
,
\
"Please check whether you have used the correct data!"
fproc
.
close
()
if
fknwl
:
...
...
@@ -232,7 +245,8 @@ def get_database(test_datapath, train_datapath, data_type):
assert
data_type
in
[
"wow_seen"
,
"wow_unseen"
,
"woi"
],
\
"Please input a correct data type!!"
# get test data topic list
# get test data topic dictionary
print
(
"> reading test data from %s"
%
test_datapath
)
test_topics
=
{}
with
open
(
test_datapath
,
"r"
)
as
f
:
...
...
@@ -265,8 +279,6 @@ def get_database(test_datapath, train_datapath, data_type):
# get the instance
last_turn
=
turns
[
-
1
]
if
data_type
==
"woi"
:
instance
=
"( "
+
last_turn
+
" ) "
+
topic
+
" -> "
+
knowledge
else
:
instance
=
"( "
+
last_turn
+
" ) "
+
topic
+
" => "
+
knowledge
# construct dialog example
...
...
tasks/knwl_dialo/prompt.py
View file @
3ec549ba
...
...
@@ -110,10 +110,7 @@ def generate_samples_by_prompting_input_from_file(model):
# construct inputs for knowledge generation
turns
=
splits
[
1
].
split
(
" [SEP] "
)
context
=
turns
[
-
1
]
if
" -> "
in
raw_text
and
" => "
not
in
raw_text
:
raw_text
+=
"( "
+
context
+
" ) "
+
topic
+
" ->"
else
:
raw_text
+=
"( "
+
context
+
" ) "
+
topic
+
" =>"
raw_text
+=
"( "
+
context
+
" ) "
+
topic
+
" =>"
else
:
# construct inputs for response generation
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment