Commit f00b019e authored by zihanl's avatar zihanl
Browse files

update commands

parent 6464742e
......@@ -6,53 +6,78 @@
# WoI: https://parl.ai/projects/sea/
# All paths below are built from the directory the script is launched from.
DIR=$(pwd)
# Create the per-dataset data folders. -p also creates the missing parent
# (tasks/knwl_dialo/data) and keeps the step idempotent, so re-running the
# script does not fail with "File exists".
mkdir -p -- \
  "${DIR}/tasks/knwl_dialo/data/wizard_of_wikipedia" \
  "${DIR}/tasks/knwl_dialo/data/wizard_of_internet"
# Before running the preprocessing, please download the datasets,
# and put them into the corresponding created data folder.
# Before running the preprocessing, please download
# the Wizard of Wikipedia and Wizard of Internet datasets
# Folders holding the downloaded raw datasets; the commands below also write
# their processed/output files into these folders.
# The <...> values are placeholders: replace them with real paths before
# running, because the shell parses literal < and > as redirections.
WOW_DATA_FOLDER=<PATH_OF_WIZARD_OF_WIKIPEDIA_DATA_FOLDER>
WOI_DATA_FOLDER=<PATH_OF_WIZARD_OF_INTERNET_DATA_FOLDER>
# We provide examples for processing the raw data from Wizard of Wikipedia
# Process the Wizard of Wikipedia training split (train.json) by calling
# preprocessing.py with --func process_wow_dataset.
# NOTE(review): this text appears to come from a rendered diff, so two
# --raw_file/--processed_file pairs are listed: the first (fixed data path and
# <...> placeholder) looks like the pre-change version, the second
# (${WOW_DATA_FOLDER}-based) like the post-change version — confirm which to keep.
python ${DIR}/tasks/knwl_dialo/preprocessing.py \
--func process_wow_dataset \
--raw_file ${DIR}/tasks/knwl_dialo/data/wizard_of_wikipedia/train.json \
--processed_file <PATH_OF_THE_PROCESSED_WOW_TRAIN_DATA>
--raw_file ${WOW_DATA_FOLDER}/train.json \
--processed_file ${WOW_DATA_FOLDER}/train_processed.txt
# Process the Wizard of Wikipedia test-seen split (test_random_split.json)
# with --func process_wow_dataset. In addition to the processed file, output
# paths for the knowledge references (--knwl_ref_file) and the response
# references (--resp_ref_file) are supplied.
# NOTE(review): both the <...> placeholder arguments and the
# ${WOW_DATA_FOLDER}-based arguments are listed (apparently old and new sides
# of a diff) — confirm which set to keep.
python ${DIR}/tasks/knwl_dialo/preprocessing.py \
--func process_wow_dataset \
--raw_file ${DIR}/tasks/knwl_dialo/data/wizard_of_wikipedia/test_random_split.json \
--processed_file <PATH_OF_THE_PROCESSED_TEST_SEEN_DATA> \
--knwl_ref_file <PATH_OF_THE_TEST_SEEN_KNOWLEDGE_REFERENCE_OUTPUT_DATA> \
--resp_ref_file <PATH_OF_THE_TEST_SEEN_RESPONSE_REFERENCE_OUTPUT_DATA>
--raw_file ${WOW_DATA_FOLDER}/test_random_split.json \
--processed_file ${WOW_DATA_FOLDER}/testseen_processed.txt \
--knwl_ref_file ${WOW_DATA_FOLDER}/output_testseen_knowledge_reference.txt \
--resp_ref_file ${WOW_DATA_FOLDER}/output_testseen_response_reference.txt
# Process the Wizard of Wikipedia test-unseen split (test_topic_split.json)
# with --func process_wow_dataset, again passing output paths for the
# processed file, the knowledge references and the response references.
# NOTE(review): both the <...> placeholder arguments and the
# ${WOW_DATA_FOLDER}-based arguments are listed (apparently old and new sides
# of a diff) — confirm which set to keep.
python ${DIR}/tasks/knwl_dialo/preprocessing.py \
--func process_wow_dataset \
--raw_file ${DIR}/tasks/knwl_dialo/data/wizard_of_wikipedia/test_topic_split.json \
--processed_file <PATH_OF_THE_PROCESSED_TEST_UNSEEN_DATA> \
--knwl_ref_file <PATH_OF_THE_TEST_UNSEEN_KNOWLEDGE_REFERENCE_OUTPUT_DATA> \
--resp_ref_file <PATH_OF_THE_TEST_UNSEEN_RESPONSE_REFERENCE_OUTPUT_DATA>
--raw_file ${WOW_DATA_FOLDER}/test_topic_split.json \
--processed_file ${WOW_DATA_FOLDER}/testunseen_processed.txt \
--knwl_ref_file ${WOW_DATA_FOLDER}/output_testunseen_knowledge_reference.txt \
--resp_ref_file ${WOW_DATA_FOLDER}/output_testunseen_response_reference.txt
# We provide the following script to process the raw data from Wizard of Internet
# Process the Wizard of Internet test set (test.jsonl) by calling
# preprocessing.py with --func process_woi_dataset; output paths for the
# processed file, the knowledge references and the response references are
# supplied.
# NOTE(review): both the <...> placeholder arguments and the
# ${WOI_DATA_FOLDER}-based arguments are listed (apparently old and new sides
# of a diff) — confirm which set to keep.
python ${DIR}/tasks/knwl_dialo/preprocessing.py \
--func process_woi_dataset \
--raw_file ${DIR}/tasks/knwl_dialo/data/wizard_of_internet/test.jsonl \
--processed_file <PATH_OF_THE_PROCESSED_TEST_DATA> \
--knwl_ref_file <PATH_OF_THE_TEST_KNOWLEDGE_REFERENCE_OUTPUT_DATA> \
--resp_ref_file <PATH_OF_THE_TEST_RESPONSE_REFERENCE_OUTPUT_DATA>
--raw_file ${WOI_DATA_FOLDER}/test.jsonl \
--processed_file ${WOI_DATA_FOLDER}/test_processed.txt \
--knwl_ref_file ${WOI_DATA_FOLDER}/output_test_knowledge_reference.txt \
--resp_ref_file ${WOI_DATA_FOLDER}/output_test_response_reference.txt
# Obtain the knowledge generation prompts for each test dataset (Wizard of Wikipedia test seen/unseen and Wizard of Internet test)
# Get the knowledge generation prompts for each test dataset in WoW and WoI
# MODEL_FILE is passed as --model_file to every get_knwl_gen_prompts call
# below. The <...> value is a placeholder (a finetuned DPR model, per its
# name) and must be replaced with a real path before running.
MODEL_FILE=<PATH_OF_THE_FINETUNED_DPR_MODEL>
# WoW test seen: build the knowledge generation prompts with
# --func get_knwl_gen_prompts, reading the processed test-seen and train files
# and writing output_testseen_knowledge_prompts.json (--data_type wow_seen).
python ${DIR}/tasks/knwl_dialo/preprocessing.py \
--func get_knwl_gen_prompts \
--test_file ${WOW_DATA_FOLDER}/testseen_processed.txt \
--train_file ${WOW_DATA_FOLDER}/train_processed.txt \
--model_file ${MODEL_FILE} \
--processed_file ${WOW_DATA_FOLDER}/output_testseen_knowledge_prompts.json \
--data_type wow_seen
# WoW test unseen: build the knowledge generation prompts with
# --func get_knwl_gen_prompts, reading the processed test-unseen and train
# files and writing output_testunseen_knowledge_prompts.json
# (--data_type wow_unseen).
# NOTE(review): the five <...> placeholder arguments are followed by the
# concrete ${WOW_DATA_FOLDER}/${MODEL_FILE} arguments (apparently old and new
# sides of a diff) — confirm which set to keep.
python ${DIR}/tasks/knwl_dialo/preprocessing.py \
--func get_knwl_gen_prompts \
--test_file <PATH_OF_THE_PROCESSED_TEST_DATA> \
--train_file <PATH_OF_THE_PROCESSED_WOW_TRAIN_DATA> \
--model_file <PATH_OF_THE_DPR_MODEL> \
--processed_file <PATH_OF_THE_OUTPUT_PROMPT_FILE> \
--data_type <DATA_TYPE_OF_THE_INPUT_FILE>
--test_file ${WOW_DATA_FOLDER}/testunseen_processed.txt \
--train_file ${WOW_DATA_FOLDER}/train_processed.txt \
--model_file ${MODEL_FILE} \
--processed_file ${WOW_DATA_FOLDER}/output_testunseen_knowledge_prompts.json \
--data_type wow_unseen
# WoI: build the knowledge generation prompts with --func get_knwl_gen_prompts.
# The test file comes from the WoI folder, while --train_file still points at
# the processed WoW train data; output goes to
# ${WOI_DATA_FOLDER}/output_test_knowledge_prompts.json (--data_type woi).
python ${DIR}/tasks/knwl_dialo/preprocessing.py \
--func get_knwl_gen_prompts \
--test_file ${WOI_DATA_FOLDER}/test_processed.txt \
--train_file ${WOW_DATA_FOLDER}/train_processed.txt \
--model_file ${MODEL_FILE} \
--processed_file ${WOI_DATA_FOLDER}/output_test_knowledge_prompts.json \
--data_type woi
# Obtain the response generation prompts
# Get the response generation prompts (can be applied to all the test datasets).
# Calls preprocessing.py with --func get_resp_gen_prompts, taking only the
# processed WoW train data as input.
# NOTE(review): the <...> placeholder arguments are followed by the
# ${WOW_DATA_FOLDER}-based arguments (apparently old and new sides of a diff)
# — confirm which set to keep.
python ${DIR}/tasks/knwl_dialo/preprocessing.py \
--func get_resp_gen_prompts \
--train_file <PATH_OF_THE_PROCESSED_WOW_TRAIN_DATA> \
--processed_file <PATH_OF_THE_OUTPUT_PROMPT_FILE>
--train_file ${WOW_DATA_FOLDER}/train_processed.txt \
--processed_file ${WOW_DATA_FOLDER}/output_response_prompts.txt
......@@ -3,8 +3,13 @@
# Prepare the input file for response generation (second-stage prompting).
# Calls preprocessing.py with --func prepare_input, combining the processed
# test data (--test_file) with the generated knowledge (--knowledge_gen_file)
# and writing the result to --processed_file.
# All paths are built from the directory the script is launched from.
DIR=`pwd`
# The <...> values are placeholders: replace them with real paths before
# running, because the shell parses literal < and > as redirections.
TEST_FILE=<PATH_OF_THE_PROCESSED_TEST_DATA>
KNOWLEDGE_FILE=<PATH_OF_THE_GENERATED_KNOWLEDGE_DATA>
PROCESSED_FILE=<PATH_OF_THE_INPUT_FILE_FOR_RESPONSE_GENERATION>
# NOTE(review): the inline <...> arguments are followed by the variable-based
# arguments (apparently old and new sides of a diff) — confirm which set to keep.
python ${DIR}/tasks/knwl_dialo/preprocessing.py \
--func prepare_input \
--test_file <PATH_OF_THE_PROCESSED_TEST_DATA> \
--knowledge_gen_file <PATH_OF_THE_GENERATED_KNOWLEDGE_DATA> \
--processed_file <PATH_OF_THE_INPUT_FILE_FOR_RESPONSE_GENERATION>
--test_file ${TEST_FILE} \
--knowledge_gen_file ${KNOWLEDGE_FILE} \
--processed_file ${PROCESSED_FILE}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment