"docs/source/api/vscode:/vscode.git/clone" did not exist on "fad970aa810d78fadfa032db105b62a6572f99aa"
Commit a79820a6 authored by zihanl's avatar zihanl
Browse files

add preprocessing

parent b1a6d73b
# Multi-Stage Prompting for Knowledgeable Dialogue Generation
We present the steps to run our multi-stage dialogue prompting (MSDP) as well as the finetuning-based baselines (FKG and FCM).
We present the steps to run our multi-stage dialogue prompting (MSDP), as well as the baselines, finetuning-based knowledge generation (FKG) and finetuning-based conversation model (FCM).
## MSDP
## Multi-Stage Dialogue Prompting (MSDP)
### Data Preparation
1. Dataset Download: [Wizard of Wikipedia](https://parl.ai/projects/wizard_of_wikipedia/) and [Wizard of Internet](https://parl.ai/projects/sea/)
2. Data Processing: We provide script ```tasks/knwl_dialo/scripts/data_processing.sh``` to process the data.
### Knowledge Generation
1. The script ```tasks/knwl_dialo/scripts/prompt_knwl_gen.sh``` provides an example for how to perform the knowledge generation prompting.
2. The F1 score can be evaluated through ```tasks/knwl_dialo/scripts/eval_generation.sh```. Other automatic metrics follow the [nlg-eval](https://github.com/Maluuba/nlg-eval).
### Response Generation
1. Prepare the input file for the response generation (based on the previously generated knowledge file):
2. The script ```tasks/knwl_dialo/scripts/prompt_resp_gen.sh``` provides an example for how to perform the response generation prompting.
3. The automatic evaluations are the same as mentioned above for the knowledge generation.
## FKG
### Knowledge Generation
### Response Generation
## FCM
### Knowledge Generation
### Response Generation
import argparse
import json

from nltk import word_tokenize
from tqdm import tqdm
def get_params(args=None):
    """Parse command-line arguments for the preprocessing script.

    Args:
        args: Optional list of argument strings. Defaults to None, in which
            case argparse reads sys.argv (the original behavior). Exposed so
            callers and tests can inject arguments directly.

    Returns:
        argparse.Namespace with ``func``, ``input_file``, ``knowledge_file``
        and ``output_file`` attributes (all default to the empty string).
    """
    parser = argparse.ArgumentParser(description="Preprocessing")
    parser.add_argument("--func", type=str, default="",
                        help="Name of the preprocessing function to run.")
    parser.add_argument("--input_file", type=str, default="",
                        help="Path of the raw input data file.")
    parser.add_argument("--knowledge_file", type=str, default="",
                        help="Path of the knowledge file.")
    parser.add_argument("--output_file", type=str, default="",
                        help="Path of the processed output file.")
    params = parser.parse_args(args)
    return params
def process_wow_dataset(input_file, output_file):
    """Convert the raw Wizard of Wikipedia json dump into flat TSV lines.

    Each wizard turn (except a dialogue-opening first turn) produces one line:
        topic \t dialogue context \t golden knowledge \t golden response
    where the context turns are joined with " [SEP] ".

    Args:
        input_file: path to the raw WoW json file (a list of dialogue samples).
        output_file: path of the TSV file to write.
    """
    with open(input_file, "r", encoding="utf-8") as fr:
        dialog_data = json.load(fr)

    with open(output_file, "w", encoding="utf-8") as fw:
        for sample in tqdm(dialog_data):
            dialog = sample["dialog"]
            context = []
            for j, turn in enumerate(dialog):
                text = turn["text"]
                # Ensure every turn ends with sentence punctuation, then
                # normalize token spacing with nltk's tokenizer.
                if not (text.endswith("?") or text.endswith(".") or text.endswith("!")):
                    text = text + " ."
                text = " ".join(word_tokenize(text))

                if j == 0:
                    # The opening turn only seeds the context; there is no
                    # preceding context to pair it with.
                    context.append(text)
                    continue

                speaker = turn["speaker"].lower()
                if "wizard" in speaker:
                    checked_sentence = list(turn["checked_sentence"].values())  # knowledge
                    checked_passage = list(turn["checked_passage"].values())    # topic

                    assert len(checked_sentence) <= 1
                    if len(checked_sentence) > 0:
                        checked_sentence = checked_sentence[0]
                    else:
                        checked_sentence = "no_passages_used"

                    if len(checked_passage) == 1:
                        checked_passage = checked_passage[0]
                    else:
                        checked_passage = "no_passages_used"

                    # Fall back to the dialogue's chosen topic when the wizard
                    # selected no passage.
                    if checked_passage != "no_passages_used":
                        topic = checked_passage
                    else:
                        topic = sample["chosen_topic"]

                    fw.write(topic + "\t" + " [SEP] ".join(context) + "\t" +
                             checked_sentence + "\t" + text + "\n")
                    context.append(text)
                else:
                    # Apprentice turns carry no knowledge annotation; they only
                    # extend the context.
                    assert "apprentice" in speaker
                    context.append(text)
def process_woi_dataset(input_file, output_file):
    """Convert the raw Wizard of Internet jsonl dump into flat TSV lines.

    Each wizard response (except a dialogue-opening one) whose topic is known
    produces one line:
        topic \t dialogue context \t golden knowledge \t golden response
    where the context turns are joined with " [SEP] ".

    Fixes vs. the original: the file opens referenced undefined
    ``input_path`` / ``output_path`` instead of the function's parameters
    (NameError at runtime), and the inner turn loop shadowed the outer
    file-line loop index.

    Args:
        input_file: path to the raw WoI jsonl file (one dialogue per line).
        output_file: path of the TSV file to write.
    """
    def _clean(text):
        # Remove characters that would corrupt the one-example-per-line,
        # tab-separated output format.
        return text.replace("\n", "").replace("\r", "").replace("\t", "")

    with open(output_file, "w", encoding="utf-8") as fw:
        with open(input_file, "r", encoding="utf-8") as fr:
            for line in tqdm(fr):
                line = line.strip()
                item_dict = json.loads(line)

                # Each line is {dialogue_id: {...}} with exactly one entry.
                item_dict = item_dict.values()
                assert len(item_dict) == 1
                item_dict = list(item_dict)[0]

                dialog_data = item_dict['dialog_history']
                turn_list = []     # accumulated dialogue context
                search_text = ""   # last search query issued by the wizard

                for item in dialog_data:
                    action = item['action']
                    if action == "Wizard => SearchAgent":
                        # Remember the query; it doubles as the topic label.
                        search_text = item['text']

                    elif action == "Wizard => Apprentice":
                        if len(turn_list) == 0:
                            # Wizard opens the dialogue: no context yet, so no
                            # training example can be emitted.
                            turn_list.append(item['text'])
                            continue

                        # Gather the knowledge sentences the wizard selected.
                        contents = item["context"]["contents"]
                        selects = item["context"]["selected_contents"]
                        flag = selects[0][0]
                        selects = selects[1:]
                        assert len(selects) == len(contents)

                        if flag:
                            # no knowledge sentence is used
                            topic = "no_topic"
                            sent_list = ["no_passages_used"]
                        else:
                            # assert search_text != ""
                            topic = search_text
                            sent_list = []
                            for content, select in zip(contents, selects):
                                content = content['content']
                                assert len(content) == len(select)
                                for c, s in zip(content, select):
                                    if s:
                                        sent_list.append(c)
                            if len(sent_list) == 0:
                                topic = "no_topic"
                                sent_list = ["no_passages_used"]

                        dialog_context = _clean(" [SEP] ".join(turn_list))
                        knwl_sent = _clean(sent_list[0])
                        response = _clean(item['text'])
                        topic = _clean(topic)

                        # Only emit examples with a usable topic.
                        if topic != "no_topic":
                            fw.write(topic + "\t" + dialog_context + "\t" +
                                     knwl_sent + "\t" + response + "\n")
                        turn_list.append(response)

                    elif action == "Apprentice => Wizard":
                        turn_list.append(item['text'])

                    else:
                        # Search results coming back to the wizard; consumed
                        # via item["context"] on the next wizard turn.
                        assert action == "SearchAgent => Wizard"
if __name__ == "__main__":
params = get_params()
if params.func == "process_wow_dataset":
process_wow_dataset(params.input_file, params.output_file)
elif params.func == "process_woi_dataset":
process_woi_dataset(params.input_file, params.output_file)
......@@ -104,7 +104,10 @@ def generate_samples_by_prompting_input_from_file(model):
if args.prompt_type == "knowledge":
turns = splits[1].split(" [SEP] ")
context = turns[-1]
raw_text += "( " + context + " ) " + topic + " =>"
if " -> " in raw_text:
raw_text += "( " + context + " ) " + topic + " ->"
else:
raw_text += "( " + context + " ) " + topic + " =>"
else:
# args.prompt_type == "response":
......
#!/bin/bash
# Prepare the processed Wizard-of-Wikipedia / Wizard-of-Internet data for the
# knowledgeable-dialogue tasks. Replace the <...> placeholders with real paths
# before running.
DIR=`pwd`
# Output directory for the processed files.
mkdir -p $DIR/tasks/knwl_dialo/data
# We provide the following script to process the raw data from Wizard of Wikipedia
python ${DIR}/tasks/knwl_dialo/preprocessing.py --func process_wow_dataset --input_file <PATH_OF_THE_INPUT_DATA> --output_file <PATH_OF_THE_OUTPUT_DATA>
# We provide the following script to process the raw data from Wizard of Internet
python ${DIR}/tasks/knwl_dialo/preprocessing.py --func process_woi_dataset --input_file <PATH_OF_THE_INPUT_DATA> --output_file <PATH_OF_THE_OUTPUT_DATA>
# Alternatively, we recommend you to directly download the already processed file through:
# wget
#!/bin/bash
# Evaluate generations against the ground truth with the F1 metric
# (task KNWL-DIALO-EVAL-F1) via a single-process torch.distributed launch.

WORLD_SIZE=1

DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
                  --nnodes 1 \
                  --node_rank 0 \
                  --master_addr localhost \
                  --master_port 6000"

# Replace the placeholders below with real paths before running.
OUTPUT_PATH=<Specific path for the output generation>
GROUND_TRUTH_PATH=<Specific path for the ground truth>

python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
        --num-layers 24 \
        --hidden-size 1024 \
        --num-attention-heads 16 \
        --seq-length 2048 \
        --max-position-embeddings 2048 \
        --micro-batch-size 4 \
        --task KNWL-DIALO-EVAL-F1 \
        --guess-file ${OUTPUT_PATH} \
        --answer-file ${GROUND_TRUTH_PATH}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment