update commands

f00b019e · zihanl · 6464742e · f00b019e · f00b019e
Commit f00b019e authored Dec 06, 2021 by zihanl
Showing with 61 additions and 31 deletions

tasks/knwl_dialo/scripts/data_processing.sh tasks/knwl_dialo/scripts/data_processing.sh +53 -28

tasks/knwl_dialo/scripts/prep_resp_gen.sh tasks/knwl_dialo/scripts/prep_resp_gen.sh +8 -3

No files found.
--- a/tasks/knwl_dialo/scripts/data_processing.sh
+++ b/tasks/knwl_dialo/scripts/data_processing.sh
@@ -6,53 +6,78 @@
 # WoI: https://parl.ai/projects/sea/

 DIR=`pwd`
-mkdir ${DIR}/tasks/knwl_dialo/data
-mkdir ${DIR}/tasks/knwl_dialo/data/wizard_of_wikipedia
-mkdir ${DIR}/tasks/knwl_dialo/data/wizard_of_internet
-# Before running the preprocessing, please download the datasets, 
-# and put them into the corresponding created data folder.
+# Before running the preprocessing, please download
+# the Wizard of Wikipedia and Wizard of Internet datasets
+WOW_DATA_FOLDER=<PATH_OF_WIZARD_OF_WIKIPEDIA_DATA_FOLDER>
+WOI_DATA_FOLDER=<PATH_OF_WIZARD_OF_INTERNET_DATA_FOLDER>

 # We provide examples for processing the raw data from Wizard of Wikipedia
+# Processing the train dataset (train.json)
 python ${DIR}/tasks/knwl_dialo/preprocessing.py \
        --func process_wow_dataset \
-        --raw_file ${DIR}/tasks/knwl_dialo/data/wizard_of_wikipedia/train.json \
-        --processed_file <PATH_OF_THE_PROCESSED_WOW_TRAIN_DATA>
+        --raw_file ${WOW_DATA_FOLDER}/train.json \
+        --processed_file ${WOW_DATA_FOLDER}/train_processed.txt

+# Processing the test seen dataset (test_random_split.json)
 python ${DIR}/tasks/knwl_dialo/preprocessing.py \
        --func process_wow_dataset \
-        --raw_file ${DIR}/tasks/knwl_dialo/data/wizard_of_wikipedia/test_random_split.json \
-        --processed_file <PATH_OF_THE_PROCESSED_TEST_SEEN_DATA> \
-        --knwl_ref_file <PATH_OF_THE_TEST_SEEN_KNOWLEDGE_REFERENCE_OUTPUT_DATA> \
-        --resp_ref_file <PATH_OF_THE_TEST_SEEN_RESPONSE_REFERENCE_OUTPUT_DATA>
+        --raw_file ${WOW_DATA_FOLDER}/test_random_split.json \
+        --processed_file ${WOW_DATA_FOLDER}/testseen_processed.txt \
+        --knwl_ref_file ${WOW_DATA_FOLDER}/output_testseen_knowledge_reference.txt \
+        --resp_ref_file ${WOW_DATA_FOLDER}/output_testseen_response_reference.txt

+# Processing the test unseen dataset (test_topic_split.json)
 python ${DIR}/tasks/knwl_dialo/preprocessing.py \
        --func process_wow_dataset \
-        --raw_file ${DIR}/tasks/knwl_dialo/data/wizard_of_wikipedia/test_topic_split.json \
-        --processed_file <PATH_OF_THE_PROCESSED_TEST_UNSEEN_DATA> \
-        --knwl_ref_file <PATH_OF_THE_TEST_UNSEEN_KNOWLEDGE_REFERENCE_OUTPUT_DATA> \
-        --resp_ref_file <PATH_OF_THE_TEST_UNSEEN_RESPONSE_REFERENCE_OUTPUT_DATA>
+        --raw_file ${WOW_DATA_FOLDER}/test_topic_split.json \
+        --processed_file ${WOW_DATA_FOLDER}/testunseen_processed.txt \
+        --knwl_ref_file ${WOW_DATA_FOLDER}/output_testunseen_knowledge_reference.txt \
+        --resp_ref_file ${WOW_DATA_FOLDER}/output_testunseen_response_reference.txt


 # We provide the following script to process the raw data from Wizard of Internet
+# Processing the test dataset (test.jsonl)
 python ${DIR}/tasks/knwl_dialo/preprocessing.py \
        --func process_woi_dataset \
-        --raw_file ${DIR}/tasks/knwl_dialo/data/wizard_of_internet/test.jsonl \
-        --processed_file <PATH_OF_THE_PROCESSED_TEST_DATA> \
-        --knwl_ref_file <PATH_OF_THE_TEST_KNOWLEDGE_REFERENCE_OUTPUT_DATA> \
-        --resp_ref_file <PATH_OF_THE_TEST_RESPONSE_REFERENCE_OUTPUT_DATA>
+        --raw_file ${WOI_DATA_FOLDER}/test.jsonl \
+        --processed_file ${WOI_DATA_FOLDER}/test_processed.txt \
+        --knwl_ref_file ${WOI_DATA_FOLDER}/output_test_knowledge_reference.txt \
+        --resp_ref_file ${WOI_DATA_FOLDER}/output_test_response_reference.txt

-# Obtain the knowledge generation prompts for each test dataset (Wizard of Wikipedia test seen/unseen and Wizard of Internet test)
+
+# Get the knowledge generation prompts for each test dataset in WoW and WoI
+MODEL_FILE=<PATH_OF_THE_FINETUNED_DPR_MODEL> 
+# WoW test seen
+python ${DIR}/tasks/knwl_dialo/preprocessing.py \
+        --func get_knwl_gen_prompts \
+        --test_file ${WOW_DATA_FOLDER}/testseen_processed.txt \
+        --train_file ${WOW_DATA_FOLDER}/train_processed.txt \
+        --model_file ${MODEL_FILE} \
+        --processed_file ${WOW_DATA_FOLDER}/output_testseen_knowledge_prompts.json \
+        --data_type wow_seen
+
+# WoW test unseen
 python ${DIR}/tasks/knwl_dialo/preprocessing.py \
        --func get_knwl_gen_prompts \
-        --test_file <PATH_OF_THE_PROCESSED_TEST_DATA> \
-        --train_file <PATH_OF_THE_PROCESSED_WOW_TRAIN_DATA> \
-        --model_file <PATH_OF_THE_DPR_MODEL> \
-        --processed_file <PATH_OF_THE_OUTPUT_PROMPT_FILE> \
-        --data_type <DATA_TYPE_OF_THE_INPUT_FILE>
+        --test_file ${WOW_DATA_FOLDER}/testunseen_processed.txt \
+        --train_file ${WOW_DATA_FOLDER}/train_processed.txt \
+        --model_file ${MODEL_FILE} \
+        --processed_file ${WOW_DATA_FOLDER}/output_testunseen_knowledge_prompts.json \
+        --data_type wow_unseen
+
+# WoI
+python ${DIR}/tasks/knwl_dialo/preprocessing.py \
+        --func get_knwl_gen_prompts \
+        --test_file ${WOI_DATA_FOLDER}/test_processed.txt \
+        --train_file ${WOW_DATA_FOLDER}/train_processed.txt \
+        --model_file ${MODEL_FILE} \
+        --processed_file ${WOI_DATA_FOLDER}/output_test_knowledge_prompts.json \
+        --data_type woi
+

-# Obtain the response generation prompts
+# Get the response generation prompts (can be applied to all the test datasets)
 python ${DIR}/tasks/knwl_dialo/preprocessing.py \
        --func get_resp_gen_prompts \
-        --train_file <PATH_OF_THE_PROCESSED_WOW_TRAIN_DATA> \
-        --processed_file <PATH_OF_THE_OUTPUT_PROMPT_FILE>
+        --train_file ${WOW_DATA_FOLDER}/train_processed.txt \
+        --processed_file ${WOW_DATA_FOLDER}/output_response_prompts.txt

--- a/tasks/knwl_dialo/scripts/prep_resp_gen.sh
+++ b/tasks/knwl_dialo/scripts/prep_resp_gen.sh
@@ -3,8 +3,13 @@
 # Preparing the input file for the response generation (second-stage prompting)

 DIR=`pwd`
+
+TEST_FILE=<PATH_OF_THE_PROCESSED_TEST_DATA>
+KNOWLEDGE_FILE=<PATH_OF_THE_GENERATED_KNOWLEDGE_DATA>
+PROCESSED_FILE=<PATH_OF_THE_INPUT_FILE_FOR_RESPONSE_GENERATION>
+
 python ${DIR}/tasks/knwl_dialo/preprocessing.py \
        --func prepare_input \
-        --test_file <PATH_OF_THE_PROCESSED_TEST_DATA> \
-        --knowledge_gen_file <PATH_OF_THE_GENERATED_KNOWLEDGE_DATA> \
-        --processed_file <PATH_OF_THE_INPUT_FILE_FOR_RESPONSE_GENERATION>
+        --test_file ${TEST_FILE} \
+        --knowledge_gen_file ${KNOWLEDGE_FILE} \
+        --processed_file ${PROCESSED_FILE}