"...text-generation-inference.git" did not exist on "38cff84a3e1019e760577467299d82112da62de6"
Commit f00b019e authored by zihanl's avatar zihanl
Browse files

update commands

parent 6464742e
...@@ -6,53 +6,78 @@ ...@@ -6,53 +6,78 @@
# WoI: https://parl.ai/projects/sea/ # WoI: https://parl.ai/projects/sea/
DIR=`pwd` DIR=`pwd`
mkdir ${DIR}/tasks/knwl_dialo/data # Before running the preprocessing, please download
mkdir ${DIR}/tasks/knwl_dialo/data/wizard_of_wikipedia # the Wizard of Wikipedia and Wizard of Internet datasets
mkdir ${DIR}/tasks/knwl_dialo/data/wizard_of_internet WOW_DATA_FOLDER=<PATH_OF_WIZARD_OF_WIKIPEDIA_DATA_FOLDER>
# Before running the preprocessing, please download the datasets, WOI_DATA_FOLDER=<PATH_OF_WIZARD_OF_INTERNET_DATA_FOLDER>
# and put them into the corresponding created data folder.
# We provide examples for processing the raw data from Wizard of Wikipedia # We provide examples for processing the raw data from Wizard of Wikipedia
# Processing the train dataset (train.json)
python ${DIR}/tasks/knwl_dialo/preprocessing.py \ python ${DIR}/tasks/knwl_dialo/preprocessing.py \
--func process_wow_dataset \ --func process_wow_dataset \
--raw_file ${DIR}/tasks/knwl_dialo/data/wizard_of_wikipedia/train.json \ --raw_file ${WOW_DATA_FOLDER}/train.json \
--processed_file <PATH_OF_THE_PROCESSED_WOW_TRAIN_DATA> --processed_file ${WOW_DATA_FOLDER}/train_processed.txt
# Processing test seen dataset (test_random_split.json)
python ${DIR}/tasks/knwl_dialo/preprocessing.py \ python ${DIR}/tasks/knwl_dialo/preprocessing.py \
--func process_wow_dataset \ --func process_wow_dataset \
--raw_file ${DIR}/tasks/knwl_dialo/data/wizard_of_wikipedia/test_random_split.json \ --raw_file ${WOW_DATA_FOLDER}/test_random_split.json \
--processed_file <PATH_OF_THE_PROCESSED_TEST_SEEN_DATA> \ --processed_file ${WOW_DATA_FOLDER}/testseen_processed.txt \
--knwl_ref_file <PATH_OF_THE_TEST_SEEN_KNOWLEDGE_REFERENCE_OUTPUT_DATA> \ --knwl_ref_file ${WOW_DATA_FOLDER}/output_testseen_knowledge_reference.txt \
--resp_ref_file <PATH_OF_THE_TEST_SEEN_RESPONSE_REFERENCE_OUTPUT_DATA> --resp_ref_file ${WOW_DATA_FOLDER}/output_testseen_response_reference.txt
# Processing test unseen dataset (test_topic_split.json)
python ${DIR}/tasks/knwl_dialo/preprocessing.py \ python ${DIR}/tasks/knwl_dialo/preprocessing.py \
--func process_wow_dataset \ --func process_wow_dataset \
--raw_file ${DIR}/tasks/knwl_dialo/data/wizard_of_wikipedia/test_topic_split.json \ --raw_file ${WOW_DATA_FOLDER}/test_topic_split.json \
--processed_file <PATH_OF_THE_PROCESSED_TEST_UNSEEN_DATA> \ --processed_file ${WOW_DATA_FOLDER}/testunseen_processed.txt \
--knwl_ref_file <PATH_OF_THE_TEST_UNSEEN_KNOWLEDGE_REFERENCE_OUTPUT_DATA> \ --knwl_ref_file ${WOW_DATA_FOLDER}/output_testunseen_knowledge_reference.txt \
--resp_ref_file <PATH_OF_THE_TEST_UNSEEN_RESPONSE_REFERENCE_OUTPUT_DATA> --resp_ref_file ${WOW_DATA_FOLDER}/output_testunseen_response_reference.txt
# We provide the following script to process the raw data from Wizard of Internet # We provide the following script to process the raw data from Wizard of Internet
# Processing the test dataset (test.jsonl)
python ${DIR}/tasks/knwl_dialo/preprocessing.py \ python ${DIR}/tasks/knwl_dialo/preprocessing.py \
--func process_woi_dataset \ --func process_woi_dataset \
--raw_file ${DIR}/tasks/knwl_dialo/data/wizard_of_internet/test.jsonl \ --raw_file ${WOI_DATA_FOLDER}/test.jsonl \
--processed_file <PATH_OF_THE_PROCESSED_TEST_DATA> \ --processed_file ${WOI_DATA_FOLDER}/test_processed.txt \
--knwl_ref_file <PATH_OF_THE_TEST_KNOWLEDGE_REFERENCE_OUTPUT_DATA> \ --knwl_ref_file ${WOI_DATA_FOLDER}/output_test_knowledge_reference.txt \
--resp_ref_file <PATH_OF_THE_TEST_RESPONSE_REFERENCE_OUTPUT_DATA> --resp_ref_file ${WOI_DATA_FOLDER}/output_test_response_reference.txt
# Obtain the knowledge generation prompts for each test dataset (Wizard of Wikipedia test seen/unseen and Wizard of Internet test)
# Get the knowledge generation prompts for each test dataset in WoW and WoI
MODEL_FILE=<PATH_OF_THE_FINETUNED_DPR_MODEL>
# WoW test seen
python ${DIR}/tasks/knwl_dialo/preprocessing.py \
--func get_knwl_gen_prompts \
--test_file ${WOW_DATA_FOLDER}/testseen_processed.txt \
--train_file ${WOW_DATA_FOLDER}/train_processed.txt \
--model_file ${MODEL_FILE} \
--processed_file ${WOW_DATA_FOLDER}/output_testseen_knowledge_prompts.json \
--data_type wow_seen
# WoW test unseen
python ${DIR}/tasks/knwl_dialo/preprocessing.py \ python ${DIR}/tasks/knwl_dialo/preprocessing.py \
--func get_knwl_gen_prompts \ --func get_knwl_gen_prompts \
--test_file <PATH_OF_THE_PROCESSED_TEST_DATA> \ --test_file ${WOW_DATA_FOLDER}/testunseen_processed.txt \
--train_file <PATH_OF_THE_PROCESSED_WOW_TRAIN_DATA> \ --train_file ${WOW_DATA_FOLDER}/train_processed.txt \
--model_file <PATH_OF_THE_DPR_MODEL> \ --model_file ${MODEL_FILE} \
--processed_file <PATH_OF_THE_OUTPUT_PROMPT_FILE> \ --processed_file ${WOW_DATA_FOLDER}/output_testunseen_knowledge_prompts.json \
--data_type <DATA_TYPE_OF_THE_INPUT_FILE> --data_type wow_unseen
# WoI
python ${DIR}/tasks/knwl_dialo/preprocessing.py \
--func get_knwl_gen_prompts \
--test_file ${WOI_DATA_FOLDER}/test_processed.txt \
--train_file ${WOW_DATA_FOLDER}/train_processed.txt \
--model_file ${MODEL_FILE} \
--processed_file ${WOI_DATA_FOLDER}/output_test_knowledge_prompts.json \
--data_type woi
# Obtain the response generation prompts # Get the response generation prompts (can be applied to all the test datasets)
python ${DIR}/tasks/knwl_dialo/preprocessing.py \ python ${DIR}/tasks/knwl_dialo/preprocessing.py \
--func get_resp_gen_prompts \ --func get_resp_gen_prompts \
--train_file <PATH_OF_THE_PROCESSED_WOW_TRAIN_DATA> \ --train_file ${WOW_DATA_FOLDER}/train_processed.txt \
--processed_file <PATH_OF_THE_OUTPUT_PROMPT_FILE> --processed_file ${WOW_DATA_FOLDER}/output_response_prompts.txt
...@@ -3,8 +3,13 @@ ...@@ -3,8 +3,13 @@
# Preparing the input file for the response generation (second-stage prompting) # Preparing the input file for the response generation (second-stage prompting)
DIR=`pwd` DIR=`pwd`
TEST_FILE=<PATH_OF_THE_PROCESSED_TEST_DATA>
KNOWLEDGE_FILE=<PATH_OF_THE_GENERATED_KNOWLEDGE_DATA>
PROCESSED_FILE=<PATH_OF_THE_INPUT_FILE_FOR_RESPONSE_GENERATION>
python ${DIR}/tasks/knwl_dialo/preprocessing.py \ python ${DIR}/tasks/knwl_dialo/preprocessing.py \
--func prepare_input \ --func prepare_input \
--test_file <PATH_OF_THE_PROCESSED_TEST_DATA> \ --test_file ${TEST_FILE} \
--knowledge_gen_file <PATH_OF_THE_GENERATED_KNOWLEDGE_DATA> \ --knowledge_gen_file ${KNOWLEDGE_FILE} \
--processed_file <PATH_OF_THE_INPUT_FILE_FOR_RESPONSE_GENERATION> --processed_file ${PROCESSED_FILE}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment