#!/bin/bash

# Data preparation for our framework: preprocessing the WoW and WoI datasets
# The datasets can be downloaded through the following links:
# WoW: https://parl.ai/projects/wizard_of_wikipedia/
# WoI: https://parl.ai/projects/sea/

# Root used to locate tasks/knwl_dialo/preprocessing.py. It is taken from the
# current working directory, so launch this script from the directory that
# contains tasks/.
DIR="$(pwd)"

# Before running the preprocessing, please download the Wizard of Wikipedia
# and Wizard of Internet datasets, then replace the two placeholders below
# with the folders that hold the raw files.
# The values are quoted so that the <...> placeholders are not parsed as
# redirections and so that folder paths containing spaces stay intact.
WOW_DATA_FOLDER="<PATH_OF_WIZARD_OF_WIKIPEDIA_DATA_FOLDER>"
WOI_DATA_FOLDER="<PATH_OF_WIZARD_OF_INTERNET_DATA_FOLDER>"

# We provide examples for processing the raw data from Wizard of Wikipedia
# Processing the train dataset (train.json)
# --raw_file is the downloaded train.json; --processed_file is the path given
# for the processed output, placed in the same WoW data folder.
# Expansions are quoted so that paths containing spaces are passed as one word.
python "${DIR}/tasks/knwl_dialo/preprocessing.py" \
        --func process_wow_dataset \
        --raw_file "${WOW_DATA_FOLDER}/train.json" \
        --processed_file "${WOW_DATA_FOLDER}/train_processed.txt"

# Processing the test seen dataset (test_random_split.json)
# Besides the processed file, paths are passed for a knowledge reference file
# (--knwl_ref_file) and a response reference file (--resp_ref_file), all
# located in the WoW data folder.
# Expansions are quoted so that paths containing spaces are passed as one word.
python "${DIR}/tasks/knwl_dialo/preprocessing.py" \
        --func process_wow_dataset \
        --raw_file "${WOW_DATA_FOLDER}/test_random_split.json" \
        --processed_file "${WOW_DATA_FOLDER}/testseen_processed.txt" \
        --knwl_ref_file "${WOW_DATA_FOLDER}/output_testseen_knowledge_reference.txt" \
        --resp_ref_file "${WOW_DATA_FOLDER}/output_testseen_response_reference.txt"

# Processing the test unseen dataset (test_topic_split.json)
# Besides the processed file, paths are passed for a knowledge reference file
# (--knwl_ref_file) and a response reference file (--resp_ref_file), all
# located in the WoW data folder.
# Expansions are quoted so that paths containing spaces are passed as one word.
python "${DIR}/tasks/knwl_dialo/preprocessing.py" \
        --func process_wow_dataset \
        --raw_file "${WOW_DATA_FOLDER}/test_topic_split.json" \
        --processed_file "${WOW_DATA_FOLDER}/testunseen_processed.txt" \
        --knwl_ref_file "${WOW_DATA_FOLDER}/output_testunseen_knowledge_reference.txt" \
        --resp_ref_file "${WOW_DATA_FOLDER}/output_testunseen_response_reference.txt"

# We provide the following script to process the raw data from Wizard of Internet
# Processing the test dataset (test.jsonl)
# Uses the process_woi_dataset function of preprocessing.py; every input and
# output path lives in the WoI data folder.
# Expansions are quoted so that paths containing spaces are passed as one word.
python "${DIR}/tasks/knwl_dialo/preprocessing.py" \
        --func process_woi_dataset \
        --raw_file "${WOI_DATA_FOLDER}/test.jsonl" \
        --processed_file "${WOI_DATA_FOLDER}/test_processed.txt" \
        --knwl_ref_file "${WOI_DATA_FOLDER}/output_test_knowledge_reference.txt" \
        --resp_ref_file "${WOI_DATA_FOLDER}/output_test_response_reference.txt"

# Get the knowledge generation prompts for each test dataset in WoW and WoI
# Replace the placeholder with the path of the finetuned DPR model. The value
# is quoted so that the <...> placeholder is not parsed as a redirection and
# so that a path containing spaces stays intact.
MODEL_FILE="<PATH_OF_THE_FINETUNED_DPR_MODEL>"
# WoW test seen
# Takes the processed test seen file and the processed WoW train file together
# with the model file; the output path is the JSON file given to --processed_file.
# Expansions are quoted so that paths containing spaces are passed as one word.
python "${DIR}/tasks/knwl_dialo/preprocessing.py" \
        --func get_knwl_gen_prompts \
        --test_file "${WOW_DATA_FOLDER}/testseen_processed.txt" \
        --train_file "${WOW_DATA_FOLDER}/train_processed.txt" \
        --model_file "${MODEL_FILE}" \
        --processed_file "${WOW_DATA_FOLDER}/output_testseen_knowledge_prompts.json" \
        --data_type wow_seen

# WoW test unseen
# Takes the processed test unseen file and the processed WoW train file together
# with the model file; the output path is the JSON file given to --processed_file.
# Expansions are quoted so that paths containing spaces are passed as one word.
python "${DIR}/tasks/knwl_dialo/preprocessing.py" \
        --func get_knwl_gen_prompts \
        --test_file "${WOW_DATA_FOLDER}/testunseen_processed.txt" \
        --train_file "${WOW_DATA_FOLDER}/train_processed.txt" \
        --model_file "${MODEL_FILE}" \
        --processed_file "${WOW_DATA_FOLDER}/output_testunseen_knowledge_prompts.json" \
        --data_type wow_unseen

# WoI
# The test file comes from the WoI data folder, while --train_file is the
# processed WoW train file, the same one used for the WoW test sets.
# Expansions are quoted so that paths containing spaces are passed as one word.
python "${DIR}/tasks/knwl_dialo/preprocessing.py" \
        --func get_knwl_gen_prompts \
        --test_file "${WOI_DATA_FOLDER}/test_processed.txt" \
        --train_file "${WOW_DATA_FOLDER}/train_processed.txt" \
        --model_file "${MODEL_FILE}" \
        --processed_file "${WOI_DATA_FOLDER}/output_test_knowledge_prompts.json" \
        --data_type woi

# Get the response generation prompts (can be applied to all the test datasets)
# Only the processed WoW train file is passed in; the output path is
# output_response_prompts.txt in the WoW data folder.
# Expansions are quoted so that paths containing spaces are passed as one word.
python "${DIR}/tasks/knwl_dialo/preprocessing.py" \
        --func get_resp_gen_prompts \
        --train_file "${WOW_DATA_FOLDER}/train_processed.txt" \
        --processed_file "${WOW_DATA_FOLDER}/output_response_prompts.txt"