"docs/source/api/vscode:/vscode.git/clone" did not exist on "fad970aa810d78fadfa032db105b62a6572f99aa"
Commit a79820a6 authored by zihanl's avatar zihanl
Browse files

add preprocessing

parent b1a6d73b
# Multi-Stage Prompting for Knowledgeable Dialogue Generation
We present the steps to run our multi-stage dialogue prompting (MSDP) as well as the finetuning-based baselines (FKG and FCM).
We present the steps to run our multi-stage dialogue prompting (MSDP), as well as the baselines, finetuning-based knowledge generation (FKG) and finetuning-based conversation model (FCM).
## MSDP
## Multi-Stage Dialogue Prompting (MSDP)
### Data Preparation
1. Dataset Download: [Wizard of Wikipedia](https://parl.ai/projects/wizard_of_wikipedia/) and [Wizard of Internet](https://parl.ai/projects/sea/)
2. Data Processing: We provide script ```tasks/knwl_dialo/scripts/data_processing.sh``` to process the data.
### Knowledge Generation
1. The script ```tasks/knwl_dialo/scripts/prompt_knwl_gen.sh``` provides an example for how to perform the knowledge generation prompting.
2. The F1 score can be evaluated through ```tasks/knwl_dialo/scripts/eval_generation.sh```. Other automatic metrics follow the [nlg-eval](https://github.com/Maluuba/nlg-eval).
### Response Generation
1. Prepare the input file for the response generation (based on the previously generated knowledge file):
2. The script ```tasks/knwl_dialo/scripts/prompt_resp_gen.sh``` provides an example for how to perform the response generation prompting.
3. The automatic evaluations are the same as mentioned above for the knowledge generation.
## FKG
### Knowledge Generation
### Response Generation
## FCM
### Knowledge Generation
### Response Generation
import argparse
import json

from nltk import word_tokenize
from tqdm import tqdm
def get_params(args=None):
    """Parse command-line arguments for the preprocessing script.

    Args:
        args: Optional list of argument strings. Defaults to None, in which
            case argparse reads sys.argv (the original behavior). Exposed so
            callers and tests can inject arguments directly.

    Returns:
        argparse.Namespace with ``func``, ``input_file``, ``knowledge_file``
        and ``output_file`` attributes (all default to the empty string).
    """
    parser = argparse.ArgumentParser(description="Preprocessing")
    parser.add_argument("--func", type=str, default="",
                        help="Name of the preprocessing function to run.")
    parser.add_argument("--input_file", type=str, default="",
                        help="Path of the raw input data file.")
    parser.add_argument("--knowledge_file", type=str, default="",
                        help="Path of the knowledge file.")
    parser.add_argument("--output_file", type=str, default="",
                        help="Path of the processed output file.")
    params = parser.parse_args(args)
    return params
def process_wow_dataset(input_file, output_file):
    """Convert the raw Wizard of Wikipedia json dump into flat TSV lines.

    Each wizard turn (except a dialogue-opening first turn) produces one line:
        topic \t dialogue context \t golden knowledge \t golden response
    where the context turns are joined with " [SEP] ".

    Args:
        input_file: path to the raw WoW json file (a list of dialogue samples).
        output_file: path of the TSV file to write.
    """
    with open(input_file, "r", encoding="utf-8") as fr:
        dialog_data = json.load(fr)

    with open(output_file, "w", encoding="utf-8") as fw:
        for sample in tqdm(dialog_data):
            dialog = sample["dialog"]
            context = []
            for j, turn in enumerate(dialog):
                text = turn["text"]
                # Ensure every turn ends with sentence punctuation, then
                # normalize token spacing with nltk's tokenizer.
                if not (text.endswith("?") or text.endswith(".") or text.endswith("!")):
                    text = text + " ."
                text = " ".join(word_tokenize(text))

                if j == 0:
                    # The opening turn only seeds the context; there is no
                    # preceding context to pair it with.
                    context.append(text)
                    continue

                speaker = turn["speaker"].lower()
                if "wizard" in speaker:
                    checked_sentence = list(turn["checked_sentence"].values())  # knowledge
                    checked_passage = list(turn["checked_passage"].values())    # topic

                    assert len(checked_sentence) <= 1
                    if len(checked_sentence) > 0:
                        checked_sentence = checked_sentence[0]
                    else:
                        checked_sentence = "no_passages_used"

                    if len(checked_passage) == 1:
                        checked_passage = checked_passage[0]
                    else:
                        checked_passage = "no_passages_used"

                    # Fall back to the dialogue's chosen topic when the wizard
                    # selected no passage.
                    if checked_passage != "no_passages_used":
                        topic = checked_passage
                    else:
                        topic = sample["chosen_topic"]

                    fw.write(topic + "\t" + " [SEP] ".join(context) + "\t" +
                             checked_sentence + "\t" + text + "\n")
                    context.append(text)
                else:
                    # Apprentice turns carry no knowledge annotation; they only
                    # extend the context.
                    assert "apprentice" in speaker
                    context.append(text)
def process_woi_dataset(input_file, output_file):
    """Convert the raw Wizard of Internet jsonl dump into flat TSV lines.

    Each wizard response (except a dialogue-opening one) whose topic is known
    produces one line:
        topic \t dialogue context \t golden knowledge \t golden response
    where the context turns are joined with " [SEP] ".

    Fixes vs. the original: the file opens referenced undefined
    ``input_path`` / ``output_path`` instead of the function's parameters
    (NameError at runtime), and the inner turn loop shadowed the outer
    file-line loop index.

    Args:
        input_file: path to the raw WoI jsonl file (one dialogue per line).
        output_file: path of the TSV file to write.
    """
    def _clean(text):
        # Remove characters that would corrupt the one-example-per-line,
        # tab-separated output format.
        return text.replace("\n", "").replace("\r", "").replace("\t", "")

    with open(output_file, "w", encoding="utf-8") as fw:
        with open(input_file, "r", encoding="utf-8") as fr:
            for line in tqdm(fr):
                line = line.strip()
                item_dict = json.loads(line)

                # Each line is {dialogue_id: {...}} with exactly one entry.
                item_dict = item_dict.values()
                assert len(item_dict) == 1
                item_dict = list(item_dict)[0]

                dialog_data = item_dict['dialog_history']
                turn_list = []     # accumulated dialogue context
                search_text = ""   # last search query issued by the wizard

                for item in dialog_data:
                    action = item['action']
                    if action == "Wizard => SearchAgent":
                        # Remember the query; it doubles as the topic label.
                        search_text = item['text']

                    elif action == "Wizard => Apprentice":
                        if len(turn_list) == 0:
                            # Wizard opens the dialogue: no context yet, so no
                            # training example can be emitted.
                            turn_list.append(item['text'])
                            continue

                        # Gather the knowledge sentences the wizard selected.
                        contents = item["context"]["contents"]
                        selects = item["context"]["selected_contents"]
                        flag = selects[0][0]
                        selects = selects[1:]
                        assert len(selects) == len(contents)

                        if flag:
                            # no knowledge sentence is used
                            topic = "no_topic"
                            sent_list = ["no_passages_used"]
                        else:
                            # assert search_text != ""
                            topic = search_text
                            sent_list = []
                            for content, select in zip(contents, selects):
                                content = content['content']
                                assert len(content) == len(select)
                                for c, s in zip(content, select):
                                    if s:
                                        sent_list.append(c)
                            if len(sent_list) == 0:
                                topic = "no_topic"
                                sent_list = ["no_passages_used"]

                        dialog_context = _clean(" [SEP] ".join(turn_list))
                        knwl_sent = _clean(sent_list[0])
                        response = _clean(item['text'])
                        topic = _clean(topic)

                        # Only emit examples with a usable topic.
                        if topic != "no_topic":
                            fw.write(topic + "\t" + dialog_context + "\t" +
                                     knwl_sent + "\t" + response + "\n")
                        turn_list.append(response)

                    elif action == "Apprentice => Wizard":
                        turn_list.append(item['text'])

                    else:
                        # Search results coming back to the wizard; consumed
                        # via item["context"] on the next wizard turn.
                        assert action == "SearchAgent => Wizard"
if __name__ == "__main__":
params = get_params()
if params.func == "process_wow_dataset":
process_wow_dataset(params.input_file, params.output_file)
elif params.func == "process_woi_dataset":
process_woi_dataset(params.input_file, params.output_file)
......@@ -104,7 +104,10 @@ def generate_samples_by_prompting_input_from_file(model):
if args.prompt_type == "knowledge":
turns = splits[1].split(" [SEP] ")
context = turns[-1]
raw_text += "( " + context + " ) " + topic + " =>"
if " -> " in raw_text:
raw_text += "( " + context + " ) " + topic + " ->"
else:
raw_text += "( " + context + " ) " + topic + " =>"
else:
# args.prompt_type == "response":
......
#!/bin/bash
# Prepare the processed Wizard-of-Wikipedia / Wizard-of-Internet data for the
# knowledgeable-dialogue tasks. Replace the <...> placeholders with real paths
# before running.
DIR=`pwd`
# Output directory for the processed files.
mkdir -p $DIR/tasks/knwl_dialo/data
# We provide the following script to process the raw data from Wizard of Wikipedia
python ${DIR}/tasks/knwl_dialo/preprocessing.py --func process_wow_dataset --input_file <PATH_OF_THE_INPUT_DATA> --output_file <PATH_OF_THE_OUTPUT_DATA>
# We provide the following script to process the raw data from Wizard of Internet
python ${DIR}/tasks/knwl_dialo/preprocessing.py --func process_woi_dataset --input_file <PATH_OF_THE_INPUT_DATA> --output_file <PATH_OF_THE_OUTPUT_DATA>
# Alternatively, we recommend you to directly download the already processed file through:
# wget
#!/bin/bash
# Evaluate generations against the ground truth with the F1 metric
# (task KNWL-DIALO-EVAL-F1) via a single-process torch.distributed launch.

WORLD_SIZE=1

DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
                  --nnodes 1 \
                  --node_rank 0 \
                  --master_addr localhost \
                  --master_port 6000"

# Replace the placeholders below with real paths before running.
OUTPUT_PATH=<Specific path for the output generation>
GROUND_TRUTH_PATH=<Specific path for the ground truth>

python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
        --num-layers 24 \
        --hidden-size 1024 \
        --num-attention-heads 16 \
        --seq-length 2048 \
        --max-position-embeddings 2048 \
        --micro-batch-size 4 \
        --task KNWL-DIALO-EVAL-F1 \
        --guess-file ${OUTPUT_PATH} \
        --answer-file ${GROUND_TRUTH_PATH}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment