data_processing.sh 2.79 KB
Newer Older
zihanl's avatar
zihanl committed
1
2
#!/bin/bash

zihanl's avatar
zihanl committed
3
4
5
6
7
# Data preparation for our framework: preprocessing the Wizard of Wikipedia
# (WoW) and Wizard of Internet (WoI) datasets.
# The datasets can be downloaded through the following links:
# WoW: https://parl.ai/projects/wizard_of_wikipedia/
# WoI: https://parl.ai/projects/sea/

zihanl's avatar
zihanl committed
8
# Resolve all paths relative to the directory the script is launched from.
DIR=$(pwd)

# Create the folders that will hold the raw downloaded datasets.
# -p creates missing parent directories and turns a re-run into a no-op
# instead of failing with "File exists". Paths are quoted so a working
# directory containing spaces is not word-split.
mkdir -p \
        "${DIR}/tasks/knwl_dialo/data/wizard_of_wikipedia" \
        "${DIR}/tasks/knwl_dialo/data/wizard_of_internet"
zihanl's avatar
zihanl committed
12
13
# Before running the preprocessing, download the datasets and place the raw
# files in the corresponding data folders created above.
zihanl's avatar
zihanl committed
14

zihanl's avatar
zihanl committed
15
# We provide examples for processing the raw data from Wizard of Wikipedia
zihanl's avatar
zihanl committed
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# Preprocess the Wizard of Wikipedia training split (train.json).
# Replace the <...> placeholder with a real output path before running: left
# as-is, the shell parses "<" and ">" as redirection operators.
# ${DIR} expansions are quoted so a path containing spaces stays one argument.
python "${DIR}/tasks/knwl_dialo/preprocessing.py" \
        --func process_wow_dataset \
        --raw_file "${DIR}/tasks/knwl_dialo/data/wizard_of_wikipedia/train.json" \
        --processed_file <PATH_OF_THE_PROCESSED_WOW_TRAIN_DATA>

# Preprocess the Wizard of Wikipedia "test seen" split (test_random_split.json).
# Besides the processed file, this invocation is given separate output paths
# for the knowledge reference and the response reference.
# Replace every <...> placeholder with a real path before running: left as-is,
# the shell parses "<" and ">" as redirection operators.
# ${DIR} expansions are quoted so a path containing spaces stays one argument.
python "${DIR}/tasks/knwl_dialo/preprocessing.py" \
        --func process_wow_dataset \
        --raw_file "${DIR}/tasks/knwl_dialo/data/wizard_of_wikipedia/test_random_split.json" \
        --processed_file <PATH_OF_THE_PROCESSED_TEST_SEEN_DATA> \
        --knwl_ref_file <PATH_OF_THE_TEST_SEEN_KNOWLEDGE_REFERENCE_OUTPUT_DATA> \
        --resp_ref_file <PATH_OF_THE_TEST_SEEN_RESPONSE_REFERENCE_OUTPUT_DATA>

# Preprocess the Wizard of Wikipedia "test unseen" split (test_topic_split.json).
# Besides the processed file, this invocation is given separate output paths
# for the knowledge reference and the response reference.
# Replace every <...> placeholder with a real path before running: left as-is,
# the shell parses "<" and ">" as redirection operators.
# ${DIR} expansions are quoted so a path containing spaces stays one argument.
python "${DIR}/tasks/knwl_dialo/preprocessing.py" \
        --func process_wow_dataset \
        --raw_file "${DIR}/tasks/knwl_dialo/data/wizard_of_wikipedia/test_topic_split.json" \
        --processed_file <PATH_OF_THE_PROCESSED_TEST_UNSEEN_DATA> \
        --knwl_ref_file <PATH_OF_THE_TEST_UNSEEN_KNOWLEDGE_REFERENCE_OUTPUT_DATA> \
        --resp_ref_file <PATH_OF_THE_TEST_UNSEEN_RESPONSE_REFERENCE_OUTPUT_DATA>

zihanl's avatar
zihanl committed
35
36

# We provide the following script to process the raw data from Wizard of Internet
zihanl's avatar
zihanl committed
37
38
39
40
41
42
# Preprocess the Wizard of Internet test split (test.jsonl).
# Besides the processed file, this invocation is given separate output paths
# for the knowledge reference and the response reference.
# Replace every <...> placeholder with a real path before running: left as-is,
# the shell parses "<" and ">" as redirection operators.
# ${DIR} expansions are quoted so a path containing spaces stays one argument.
python "${DIR}/tasks/knwl_dialo/preprocessing.py" \
        --func process_woi_dataset \
        --raw_file "${DIR}/tasks/knwl_dialo/data/wizard_of_internet/test.jsonl" \
        --processed_file <PATH_OF_THE_PROCESSED_TEST_DATA> \
        --knwl_ref_file <PATH_OF_THE_TEST_KNOWLEDGE_REFERENCE_OUTPUT_DATA> \
        --resp_ref_file <PATH_OF_THE_TEST_RESPONSE_REFERENCE_OUTPUT_DATA>
zihanl's avatar
zihanl committed
43

zihanl's avatar
zihanl committed
44
# Obtain the knowledge generation prompts for each test dataset (Wizard of Wikipedia test seen/unseen and Wizard of Internet test)
zihanl's avatar
zihanl committed
45
46
47
48
49
50
51
# Build the knowledge generation prompts for one processed test file.
# This is a template invocation: run it once per test dataset, substituting
# the matching processed test file, output prompt file and data type.
# Replace every <...> placeholder with a real value before running: left
# as-is, the shell parses "<" and ">" as redirection operators.
# The script path is quoted so a ${DIR} containing spaces stays one argument.
python "${DIR}/tasks/knwl_dialo/preprocessing.py" \
        --func get_knwl_gen_prompts \
        --test_file <PATH_OF_THE_PROCESSED_TEST_DATA> \
        --train_file <PATH_OF_THE_PROCESSED_WOW_TRAIN_DATA> \
        --model_file <PATH_OF_THE_DPR_MODEL> \
        --processed_file <PATH_OF_THE_OUTPUT_PROMPT_FILE> \
        --data_type <DATA_TYPE_OF_THE_INPUT_FILE>
zihanl's avatar
zihanl committed
52

root's avatar
root committed
53
# Obtain the response generation prompts
zihanl's avatar
zihanl committed
54
55
56
57
# Build the response generation prompts from the processed WoW training data.
# Replace every <...> placeholder with a real path before running: left as-is,
# the shell parses "<" and ">" as redirection operators.
# The script path is quoted so a ${DIR} containing spaces stays one argument.
python "${DIR}/tasks/knwl_dialo/preprocessing.py" \
        --func get_resp_gen_prompts \
        --train_file <PATH_OF_THE_PROCESSED_WOW_TRAIN_DATA> \
        --processed_file <PATH_OF_THE_OUTPUT_PROMPT_FILE>
zihanl's avatar
zihanl committed
58