#!/bin/bash

# Data preparation for our framework: preprocessing the WoW and WoI datasets
# The datasets can be downloaded through the following links:
# WoW: https://parl.ai/projects/wizard_of_wikipedia/
# WoI: https://parl.ai/projects/sea/

# Directory the script is launched from. The preprocessing entry point is
# resolved relative to it (${DIR}/tasks/msdp/preprocessing.py).
DIR=$(pwd)

# Before running the preprocessing, please download the Wizard of Wikipedia
# and Wizard of Internet datasets, then replace the two placeholders below
# with the folders that hold the raw data.
# The placeholders are quoted so the file still parses before it is edited:
# an unquoted <...> is read by the shell as a pair of redirections.
WOW_DATA_FOLDER="<PATH_OF_WIZARD_OF_WIKIPEDIA_DATA_FOLDER>"
WOI_DATA_FOLDER="<PATH_OF_WIZARD_OF_INTERNET_DATA_FOLDER>"

# We provide examples for processing the raw data from Wizard of Wikipedia
# Processing the train dataset (train.json): train.json is passed as the raw
# file and train_processed.txt, in the same folder, as the processed file.
# Paths are quoted so that folders containing spaces stay single arguments.
python "${DIR}/tasks/msdp/preprocessing.py" \
        --func process_wow_dataset \
        --raw_file "${WOW_DATA_FOLDER}/train.json" \
        --processed_file "${WOW_DATA_FOLDER}/train_processed.txt"

# Processing the test seen dataset (test_random_split.json)
# Besides the processed file, this call also passes paths for a knowledge
# reference file and a response reference file (presumably written by
# preprocessing.py -- confirm against its argument handling).
# Paths are quoted so that folders containing spaces stay single arguments.
python "${DIR}/tasks/msdp/preprocessing.py" \
        --func process_wow_dataset \
        --raw_file "${WOW_DATA_FOLDER}/test_random_split.json" \
        --processed_file "${WOW_DATA_FOLDER}/testseen_processed.txt" \
        --knwl_ref_file "${WOW_DATA_FOLDER}/output_testseen_knowledge_reference.txt" \
        --resp_ref_file "${WOW_DATA_FOLDER}/output_testseen_response_reference.txt"

# Processing the test unseen dataset (test_topic_split.json)
# Same call shape as the test seen split, with "testunseen" file names for
# the processed file and the knowledge/response reference files.
# Paths are quoted so that folders containing spaces stay single arguments.
python "${DIR}/tasks/msdp/preprocessing.py" \
        --func process_wow_dataset \
        --raw_file "${WOW_DATA_FOLDER}/test_topic_split.json" \
        --processed_file "${WOW_DATA_FOLDER}/testunseen_processed.txt" \
        --knwl_ref_file "${WOW_DATA_FOLDER}/output_testunseen_knowledge_reference.txt" \
        --resp_ref_file "${WOW_DATA_FOLDER}/output_testunseen_response_reference.txt"


# We provide the following script to process the raw data from Wizard of Internet
# Processing the test dataset (test.jsonl)
# Uses the process_woi_dataset function; every path passed here lives in the
# Wizard of Internet data folder.
# Paths are quoted so that folders containing spaces stay single arguments.
python "${DIR}/tasks/msdp/preprocessing.py" \
        --func process_woi_dataset \
        --raw_file "${WOI_DATA_FOLDER}/test.jsonl" \
        --processed_file "${WOI_DATA_FOLDER}/test_processed.txt" \
        --knwl_ref_file "${WOI_DATA_FOLDER}/output_test_knowledge_reference.txt" \
        --resp_ref_file "${WOI_DATA_FOLDER}/output_test_response_reference.txt"


# Get the knowledge generation prompts for each test dataset in WoW and WoI
# Replace the placeholder with the path of the finetuned DPR model; it is
# passed as --model_file to every get_knwl_gen_prompts call in this script.
# The placeholder is quoted so the file still parses before it is edited:
# an unquoted <...> is read by the shell as a pair of redirections.
MODEL_FILE="<PATH_OF_THE_FINETUNED_DPR_MODEL>"
# WoW test seen
# Passes the processed test seen file and the processed WoW train file,
# together with the model file, to get_knwl_gen_prompts; the processed file
# argument names a JSON file in the WoW data folder.
# Paths are quoted so that values containing spaces stay single arguments.
python "${DIR}/tasks/msdp/preprocessing.py" \
        --func get_knwl_gen_prompts \
        --test_file "${WOW_DATA_FOLDER}/testseen_processed.txt" \
        --train_file "${WOW_DATA_FOLDER}/train_processed.txt" \
        --model_file "${MODEL_FILE}" \
        --processed_file "${WOW_DATA_FOLDER}/output_testseen_knowledge_prompts.json" \
        --data_type wow_seen

# WoW test unseen
# Same call shape as the test seen split, with the "testunseen" processed
# file as the test file and --data_type wow_unseen.
# Paths are quoted so that values containing spaces stay single arguments.
python "${DIR}/tasks/msdp/preprocessing.py" \
        --func get_knwl_gen_prompts \
        --test_file "${WOW_DATA_FOLDER}/testunseen_processed.txt" \
        --train_file "${WOW_DATA_FOLDER}/train_processed.txt" \
        --model_file "${MODEL_FILE}" \
        --processed_file "${WOW_DATA_FOLDER}/output_testunseen_knowledge_prompts.json" \
        --data_type wow_unseen

# WoI
# The test file and the processed file live in the WoI data folder, while the
# train file is still the processed WoW train file.
# Paths are quoted so that values containing spaces stay single arguments.
python "${DIR}/tasks/msdp/preprocessing.py" \
        --func get_knwl_gen_prompts \
        --test_file "${WOI_DATA_FOLDER}/test_processed.txt" \
        --train_file "${WOW_DATA_FOLDER}/train_processed.txt" \
        --model_file "${MODEL_FILE}" \
        --processed_file "${WOI_DATA_FOLDER}/output_test_knowledge_prompts.json" \
        --data_type woi


# Get the response generation prompts (can be applied for all the test datasets)
# Only the processed WoW train file is passed as input; the processed file
# argument names output_response_prompts.txt in the WoW data folder.
# Paths are quoted so that folders containing spaces stay single arguments.
python "${DIR}/tasks/msdp/preprocessing.py" \
        --func get_resp_gen_prompts \
        --train_file "${WOW_DATA_FOLDER}/train_processed.txt" \
        --processed_file "${WOW_DATA_FOLDER}/output_response_prompts.txt"