Commit 8c9c145d authored by zihanl's avatar zihanl
Browse files

update preprocessing.py

parent 91a80bd1
...@@ -13,16 +13,20 @@ def get_args(): ...@@ -13,16 +13,20 @@ def get_args():
parser.add_argument("--func", type=str, default=None, parser.add_argument("--func", type=str, default=None,
help="choose to run which function") help="choose to run which function")
parser.add_argument("--input_file", type=str, default=None, parser.add_argument("--raw_file", type=str, default=None,
help="path of the input file") help="path of the input file")
parser.add_argument("--knowledge_file", type=str, default=None, parser.add_argument("--processed_file", type=str, default=None,
help="path of the knowledge file") help="path of the output file")
parser.add_argument("--knwl_ref_file", type=str, default=None,
help="path of the knowledge reference file")
parser.add_argument("--resp_ref_file", type=str, default=None,
help="path of the response reference file")
parser.add_argument("--knwl_gen_file", type=str, default=None,
help="path of the generated knowledge file")
parser.add_argument("--test_file", type=str, default=None, parser.add_argument("--test_file", type=str, default=None,
help="path of the test file") help="path of the test file")
parser.add_argument("--train_file", type=str, default=None, parser.add_argument("--train_file", type=str, default=None,
help="path of the train file") help="path of the train file")
parser.add_argument("--output_file", type=str, default=None,
help="path of the output file")
parser.add_argument("--model_file", type=str, default=None, parser.add_argument("--model_file", type=str, default=None,
help="path of the model file") help="path of the model file")
parser.add_argument("--data_type", type=str, default=None, parser.add_argument("--data_type", type=str, default=None,
...@@ -34,19 +38,22 @@ def get_args(): ...@@ -34,19 +38,22 @@ def get_args():
return args return args
def process_wow_dataset(input_file, output_file): def process_wow_dataset(raw_file, processed_file, knwl_ref_file, resp_ref_file):
""" """
This is a function used for processing the wizard of wikipedia (wow) dataset This is a function used for processing the wizard of wikipedia (wow) dataset
Expected processed format: Expected processed format:
topic \t dialogue context \t golden knowledge \t golden response topic \t dialogue context \t golden knowledge \t golden response
""" """
print("> Loading data from %s" % input_file) print("> Loading data from %s" % raw_file)
with open(input_file, "r") as fr: with open(raw_file, "r") as fr:
dialog_data = json.load(fr) dialog_data = json.load(fr)
print("> Processing data ...") print("> Processing data ...")
with open(output_file, "w") as fw: fproc = open(processed_file, "w")
fknwl = open(knwl_ref_file, "w") if knwl_ref_file else None
fresp = open(resp_ref_file, "w") if resp_ref_file else None
for i, sample in enumerate(tqdm(dialog_data)): for i, sample in enumerate(tqdm(dialog_data)):
# get all the dialog data for a single sample # get all the dialog data for a single sample
dialog = sample["dialog"] dialog = sample["dialog"]
...@@ -86,26 +93,45 @@ def process_wow_dataset(input_file, output_file): ...@@ -86,26 +93,45 @@ def process_wow_dataset(input_file, output_file):
else: else:
topic = sample["chosen_topic"] topic = sample["chosen_topic"]
# write to the output file knowledge = checked_sentence
fw.write(topic + "\t" + " [SEP] ".join(context) + "\t" + \ response = text
checked_sentence + "\t" + text + "\n") # write to the output files
fproc.write(topic + "\t" + " [SEP] ".join(context) + "\t" + \
knowledge + "\t" + response + "\n")
if fknwl:
fknwl.write(knowledge + "\n")
if fresp:
# tokenize for evaluation
response = " ".join(word_tokenize(response))
fresp.write(response + "\n")
context.append(text) context.append(text)
else: else:
assert "apprentice" in speaker assert "apprentice" in speaker
context.append(text) context.append(text)
fproc.close()
if fknwl:
fknwl.close()
if fresp:
fresp.close()
def process_woi_dataset(input_file, output_file): def process_woi_dataset(raw_file, processed_file, knwl_ref_file, resp_ref_file):
""" """
This is a function used for processing the wizard of internet (woi) dataset This is a function used for processing the wizard of internet (woi) dataset
Expected processed format: Expected processed format:
topic \t dialogue context \t golden knowledge \t golden response topic \t dialogue context \t golden knowledge \t golden response
""" """
print("> Processing %s" % input_file) print("> Processing %s" % raw_file)
with open(output_file, "w") as fw: fproc = open(processed_file, "w")
with open(input_file, "r") as fr: fknwl = open(knwl_ref_file, "w") if knwl_ref_file else None
fresp = open(resp_ref_file, "w") if resp_ref_file else None
with open(raw_file, "r") as fr:
for i, line in tqdm(enumerate(fr)): for i, line in tqdm(enumerate(fr)):
line = line.strip() line = line.strip()
item_dict = json.loads(line) item_dict = json.loads(line)
...@@ -173,10 +199,16 @@ def process_woi_dataset(input_file, output_file): ...@@ -173,10 +199,16 @@ def process_woi_dataset(input_file, output_file):
response = response.replace("\n", "").replace("\r", \ response = response.replace("\n", "").replace("\r", \
"").replace("\t", "") "").replace("\t", "")
# write to the output file
if topic != "no_topic": if topic != "no_topic":
fw.write(topic + "\t" + dialog_context + "\t" + \ # write to the output files
fproc.write(topic + "\t" + dialog_context + "\t" + \
knwl_sent + "\t" + response + "\n") knwl_sent + "\t" + response + "\n")
if fknwl:
fknwl.write(knwl_sent + "\n")
if fresp:
# tokenize for evaluation
response = " ".join(word_tokenize(response))
fresp.write(response + "\n")
turn_list.append(response) turn_list.append(response)
...@@ -187,6 +219,12 @@ def process_woi_dataset(input_file, output_file): ...@@ -187,6 +219,12 @@ def process_woi_dataset(input_file, output_file):
else: else:
assert action == "SearchAgent => Wizard" assert action == "SearchAgent => Wizard"
fproc.close()
if fknwl:
fknwl.close()
if fresp:
fresp.close()
def get_database(test_datapath, train_datapath, data_type): def get_database(test_datapath, train_datapath, data_type):
"""Get the database by topics""" """Get the database by topics"""
...@@ -465,7 +503,6 @@ def prompt_selection_for_response_generation(input_path, output_path, seed): ...@@ -465,7 +503,6 @@ def prompt_selection_for_response_generation(input_path, output_path, seed):
prompt_example_list.append(prompt_example) prompt_example_list.append(prompt_example)
# shuffle the prompt examples # shuffle the prompt examples
print("length: %d" % len(prompt_example_list))
np.random.shuffle(prompt_example_list) np.random.shuffle(prompt_example_list)
print("> writing to %s" % output_path) print("> writing to %s" % output_path)
...@@ -476,17 +513,17 @@ def prompt_selection_for_response_generation(input_path, output_path, seed): ...@@ -476,17 +513,17 @@ def prompt_selection_for_response_generation(input_path, output_path, seed):
f.write(example + "\n") f.write(example + "\n")
def prepare_input_for_response_generation(test_file, knowledge_file, output_file): def prepare_input_for_response_generation(test_file, knwl_gen_file, processed_file):
"""Preparing inputs for the response generation""" """Preparing inputs for the response generation"""
print("> Reading knowledge file from %s" % knowledge_file) print("> Reading knowledge file from %s" % knwl_gen_file)
# get the knowledge list # get the knowledge list
with open(knowledge_file, "r") as f: with open(knwl_gen_file, "r") as f:
knowledge_list = f.readlines() knowledge_list = f.readlines()
print("> Processing ...") print("> Processing ...")
with open(test_file, "r") as fr: with open(test_file, "r") as fr:
with open(output_file, "w") as fw: with open(processed_file, "w") as fw:
for line_num, line in enumerate(tqdm(fr)): for line_num, line in enumerate(tqdm(fr)):
line = line.strip() line = line.strip()
splits = line.split("\t") splits = line.split("\t")
...@@ -508,20 +545,20 @@ if __name__ == "__main__": ...@@ -508,20 +545,20 @@ if __name__ == "__main__":
args = get_args() args = get_args()
if args.func == "process_wow_dataset": if args.func == "process_wow_dataset":
process_wow_dataset(args.input_file, args.output_file) process_wow_dataset(args.raw_file, args.processed_file, args.knwl_ref_file, args.resp_ref_file)
elif args.func == "process_woi_dataset": elif args.func == "process_woi_dataset":
process_woi_dataset(args.input_file, args.output_file) process_woi_dataset(args.raw_file, args.processed_file, args.knwl_ref_file, args.resp_ref_file)
elif args.func == "get_knwl_gen_prompts": elif args.func == "get_knwl_gen_prompts":
prompt_selection_for_knowledge_generation( prompt_selection_for_knowledge_generation(
args.test_file, args.train_file, args.model_file, args.test_file, args.train_file, args.model_file,
args.output_file, args.data_type) args.processed_file, args.data_type)
elif args.func == "get_resp_gen_prompts": elif args.func == "get_resp_gen_prompts":
prompt_selection_for_response_generation( prompt_selection_for_response_generation(
args.train_file, args.output_file, args.seed) args.train_file, args.processed_file, args.seed)
elif args.func == "prepare_input": elif args.func == "prepare_input":
prepare_input_for_response_generation( prepare_input_for_response_generation(
args.test_file, args.knowledge_file, args.output_file) args.test_file, args.knwl_gen_file, args.processed_file)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment