#!/bin/bash

# Data preparation for our framework: preprocessing the Wizard of Wikipedia
# (WoW) and Wizard of Internet (WoI) datasets.
#
# The datasets can be downloaded through the following links:
#   WoW: https://parl.ai/projects/wizard_of_wikipedia/
#   WoI: https://parl.ai/projects/sea/
#
# Usage:
#   Run this script from the repository root (paths are resolved relative to
#   the current working directory). On the first run the data folders below
#   are created; download the raw datasets into them, set the variables listed
#   in REQUIRED_VARS (either in the environment or by editing this file), and
#   run the script again.
#
# Required variables (all are file paths unless stated otherwise):
#   WOW_TRAIN_PROCESSED          output of processing WoW train.json
#   WOW_TEST_RANDOM_PROCESSED    output of processing WoW test_random_split.json
#   WOW_TEST_RANDOM_KNWL_REF     knowledge reference file for that split
#   WOW_TEST_RANDOM_RESP_REF     response reference file for that split
#   WOW_TEST_TOPIC_PROCESSED     output of processing WoW test_topic_split.json
#   WOW_TEST_TOPIC_KNWL_REF      knowledge reference file for that split
#   WOW_TEST_TOPIC_RESP_REF      response reference file for that split
#   WOI_TEST_PROCESSED           output of processing WoI test.jsonl
#   WOI_TEST_KNWL_REF            knowledge reference file for WoI test
#   WOI_TEST_RESP_REF            response reference file for WoI test
#   KNWL_PROMPT_TEST_FILE        value passed to get_knwl_gen_prompts --test_file
#   KNWL_PROMPT_TRAIN_FILE       value passed to get_knwl_gen_prompts --train_file
#   KNWL_PROMPT_MODEL_FILE       value passed to get_knwl_gen_prompts --model_file
#   KNWL_PROMPT_PROCESSED        value passed to get_knwl_gen_prompts --processed_file
#   KNWL_PROMPT_DATA_TYPE        value passed to get_knwl_gen_prompts --data_type
#                                (not a path; NOTE(review): confirm the accepted
#                                values against preprocessing.py)
#   RESP_PROMPT_TRAIN_FILE       value passed to get_resp_gen_prompts --train_file
#   RESP_PROMPT_PROCESSED        value passed to get_resp_gen_prompts --processed_file

# Stop at the first failing step so later steps never consume partial output.
set -euo pipefail

DIR="$(pwd)"
TASK_DIR="${DIR}/tasks/knwl_dialo"
PREPROCESS="${TASK_DIR}/preprocessing.py"
WOW_DATA_DIR="${TASK_DIR}/data/wizard_of_wikipedia"
WOI_DATA_DIR="${TASK_DIR}/data/wizard_of_internet"

# -p creates missing parents and makes re-running the script harmless.
mkdir -p "${WOW_DATA_DIR}" "${WOI_DATA_DIR}"

# Before running the preprocessing, please download the datasets,
# and put them into the corresponding created data folder.

# Every option below needs a value; a missing one would otherwise make the
# argument parser fail or swallow the next option. Check them all up front and
# report every missing variable at once.
REQUIRED_VARS=(
  WOW_TRAIN_PROCESSED
  WOW_TEST_RANDOM_PROCESSED WOW_TEST_RANDOM_KNWL_REF WOW_TEST_RANDOM_RESP_REF
  WOW_TEST_TOPIC_PROCESSED WOW_TEST_TOPIC_KNWL_REF WOW_TEST_TOPIC_RESP_REF
  WOI_TEST_PROCESSED WOI_TEST_KNWL_REF WOI_TEST_RESP_REF
  KNWL_PROMPT_TEST_FILE KNWL_PROMPT_TRAIN_FILE KNWL_PROMPT_MODEL_FILE
  KNWL_PROMPT_PROCESSED KNWL_PROMPT_DATA_TYPE
  RESP_PROMPT_TRAIN_FILE RESP_PROMPT_PROCESSED
)
missing_count=0
for var_name in "${REQUIRED_VARS[@]}"; do
  # ${!var_name:-} is an indirect lookup that is safe under `set -u`.
  if [[ -z "${!var_name:-}" ]]; then
    printf 'error: %s must be set before running this script\n' \
      "${var_name}" >&2
    missing_count=$((missing_count + 1))
  fi
done
if ((missing_count > 0)); then
  exit 1
fi

# We provide examples for processing the raw data from Wizard of Wikipedia
python "${PREPROCESS}" \
  --func process_wow_dataset \
  --raw_file "${WOW_DATA_DIR}/train.json" \
  --processed_file "${WOW_TRAIN_PROCESSED}"

python "${PREPROCESS}" \
  --func process_wow_dataset \
  --raw_file "${WOW_DATA_DIR}/test_random_split.json" \
  --processed_file "${WOW_TEST_RANDOM_PROCESSED}" \
  --knwl_ref_file "${WOW_TEST_RANDOM_KNWL_REF}" \
  --resp_ref_file "${WOW_TEST_RANDOM_RESP_REF}"

python "${PREPROCESS}" \
  --func process_wow_dataset \
  --raw_file "${WOW_DATA_DIR}/test_topic_split.json" \
  --processed_file "${WOW_TEST_TOPIC_PROCESSED}" \
  --knwl_ref_file "${WOW_TEST_TOPIC_KNWL_REF}" \
  --resp_ref_file "${WOW_TEST_TOPIC_RESP_REF}"

# We provide the following script to process the raw data from Wizard of Internet
python "${PREPROCESS}" \
  --func process_woi_dataset \
  --raw_file "${WOI_DATA_DIR}/test.jsonl" \
  --processed_file "${WOI_TEST_PROCESSED}" \
  --knwl_ref_file "${WOI_TEST_KNWL_REF}" \
  --resp_ref_file "${WOI_TEST_RESP_REF}"

# Obtain the knowledge generation prompts for each test dataset
# (Wizard of Wikipedia test seen/unseen and Wizard of Internet test).
# This runs once per invocation; re-run with different KNWL_PROMPT_* values
# for each test dataset.
python "${PREPROCESS}" \
  --func get_knwl_gen_prompts \
  --test_file "${KNWL_PROMPT_TEST_FILE}" \
  --train_file "${KNWL_PROMPT_TRAIN_FILE}" \
  --model_file "${KNWL_PROMPT_MODEL_FILE}" \
  --processed_file "${KNWL_PROMPT_PROCESSED}" \
  --data_type "${KNWL_PROMPT_DATA_TYPE}"

# Obtain the response generation prompts
python "${PREPROCESS}" \
  --func get_resp_gen_prompts \
  --train_file "${RESP_PROMPT_TRAIN_FILE}" \
  --processed_file "${RESP_PROMPT_PROCESSED}"