OpenDAS / Megatron-LM / Commits

Commit e755f6fd, authored Dec 09, 2021 by zihanl

add knwl_dialo scripts

parent f322c788
Showing 6 changed files with 300 additions and 0 deletions.
examples/knwl_dialo/data_processing.sh       +83  -0
examples/knwl_dialo/eval_knwl_generation.sh  +43  -0
examples/knwl_dialo/eval_resp_generation.sh  +64  -0
examples/knwl_dialo/prep_resp_gen.sh         +18  -0
examples/knwl_dialo/prompt_knwl_gen.sh       +46  -0
examples/knwl_dialo/prompt_resp_gen.sh       +46  -0
examples/knwl_dialo/data_processing.sh (new file, mode 100644)
#!/bin/bash

# Data preparation for our framework: preprocessing the WoW and WoI datasets
# The datasets can be downloaded through the following links:
# WoW: https://parl.ai/projects/wizard_of_wikipedia/
# WoI: https://parl.ai/projects/sea/

DIR=`pwd`

# Before running the preprocessing, please download
# the Wizard of Wikipedia and Wizard of Internet datasets
WOW_DATA_FOLDER=<PATH_OF_WIZARD_OF_WIKIPEDIA_DATA_FOLDER>
WOI_DATA_FOLDER=<PATH_OF_WIZARD_OF_INTERNET_DATA_FOLDER>

# We provide examples for processing the raw data from Wizard of Wikipedia

# Processing the train dataset (train.json)
python ${DIR}/tasks/knwl_dialo/preprocessing.py \
        --func process_wow_dataset \
        --raw_file ${WOW_DATA_FOLDER}/train.json \
        --processed_file ${WOW_DATA_FOLDER}/train_processed.txt

# Processing the test seen dataset (test_random_split.json)
python ${DIR}/tasks/knwl_dialo/preprocessing.py \
        --func process_wow_dataset \
        --raw_file ${WOW_DATA_FOLDER}/test_random_split.json \
        --processed_file ${WOW_DATA_FOLDER}/testseen_processed.txt \
        --knwl_ref_file ${WOW_DATA_FOLDER}/output_testseen_knowledge_reference.txt \
        --resp_ref_file ${WOW_DATA_FOLDER}/output_testseen_response_reference.txt

# Processing the test unseen dataset (test_topic_split.json)
python ${DIR}/tasks/knwl_dialo/preprocessing.py \
        --func process_wow_dataset \
        --raw_file ${WOW_DATA_FOLDER}/test_topic_split.json \
        --processed_file ${WOW_DATA_FOLDER}/testunseen_processed.txt \
        --knwl_ref_file ${WOW_DATA_FOLDER}/output_testunseen_knowledge_reference.txt \
        --resp_ref_file ${WOW_DATA_FOLDER}/output_testunseen_response_reference.txt

# We provide the following script to process the raw data from Wizard of Internet

# Processing the test dataset (test.jsonl)
python ${DIR}/tasks/knwl_dialo/preprocessing.py \
        --func process_woi_dataset \
        --raw_file ${WOI_DATA_FOLDER}/test.jsonl \
        --processed_file ${WOI_DATA_FOLDER}/test_processed.txt \
        --knwl_ref_file ${WOI_DATA_FOLDER}/output_test_knowledge_reference.txt \
        --resp_ref_file ${WOI_DATA_FOLDER}/output_test_response_reference.txt

# Get the knowledge generation prompts for each test dataset in WoW and WoI
MODEL_FILE=<PATH_OF_THE_FINETUNED_DPR_MODEL>

# WoW test seen
python ${DIR}/tasks/knwl_dialo/preprocessing.py \
        --func get_knwl_gen_prompts \
        --test_file ${WOW_DATA_FOLDER}/testseen_processed.txt \
        --train_file ${WOW_DATA_FOLDER}/train_processed.txt \
        --model_file ${MODEL_FILE} \
        --processed_file ${WOW_DATA_FOLDER}/output_testseen_knowledge_prompts.json \
        --data_type wow_seen

# WoW test unseen
python ${DIR}/tasks/knwl_dialo/preprocessing.py \
        --func get_knwl_gen_prompts \
        --test_file ${WOW_DATA_FOLDER}/testunseen_processed.txt \
        --train_file ${WOW_DATA_FOLDER}/train_processed.txt \
        --model_file ${MODEL_FILE} \
        --processed_file ${WOW_DATA_FOLDER}/output_testunseen_knowledge_prompts.json \
        --data_type wow_unseen

# WoI
python ${DIR}/tasks/knwl_dialo/preprocessing.py \
        --func get_knwl_gen_prompts \
        --test_file ${WOI_DATA_FOLDER}/test_processed.txt \
        --train_file ${WOW_DATA_FOLDER}/train_processed.txt \
        --model_file ${MODEL_FILE} \
        --processed_file ${WOI_DATA_FOLDER}/output_test_knowledge_prompts.json \
        --data_type woi

# Get the response generation prompts (can be applied to all the test datasets)
python ${DIR}/tasks/knwl_dialo/preprocessing.py \
        --func get_resp_gen_prompts \
        --train_file ${WOW_DATA_FOLDER}/train_processed.txt \
        --processed_file ${WOW_DATA_FOLDER}/output_response_prompts.txt
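With the placeholder paths filled in, the whole preprocessing pass can be launched from the repository root (a minimal usage sketch; the checkout path below is illustrative, not from the commit):

cd /path/to/Megatron-LM    # repository root, so DIR=`pwd` resolves tasks/knwl_dialo/preprocessing.py
bash examples/knwl_dialo/data_processing.sh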
examples/knwl_dialo/eval_knwl_generation.sh (new file, mode 100644)
#!/bin/bash

#########################
# Evaluate the F1 scores.
#########################

WORLD_SIZE=1

DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
                  --nnodes 1 \
                  --node_rank 0 \
                  --master_addr localhost \
                  --master_port 6000"

MODEL_GEN_PATH=<PATH_OF_THE_KNOWLEDGE_GENERATION> \
        (e.g., /testseen_knowledge_generations.txt)
GROUND_TRUTH_PATH=<PATH_OF_THE_GROUND_TRUTH_KNOWLEDGE> \
        (e.g., /testseen_knowledge_reference.txt)

python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
        --num-layers 24 \
        --hidden-size 1024 \
        --num-attention-heads 16 \
        --seq-length 2048 \
        --max-position-embeddings 2048 \
        --micro-batch-size 4 \
        --task KNWL-DIALO-EVAL-F1 \
        --guess-file ${MODEL_GEN_PATH} \
        --answer-file ${GROUND_TRUTH_PATH}

############################################
# Evaluate BLEU, METEOR, and ROUGE-L scores.
############################################

# We follow nlg-eval (https://github.com/Maluuba/nlg-eval) to
# evaluate the BLEU, METEOR, and ROUGE-L scores.
# To evaluate on these metrics, please set up the environment based on
# the nlg-eval GitHub repository, and run the corresponding evaluation commands.
nlg-eval \
        --hypothesis=<PATH_OF_THE_KNOWLEDGE_GENERATION> \
        --references=<PATH_OF_THE_GROUND_TRUTH_KNOWLEDGE>
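The nlg-eval command above assumes the tool is already installed. A setup sketch following the steps published in the Maluuba/nlg-eval README (treat the exact commands as assumptions and verify against the upstream repository):

pip install git+https://github.com/Maluuba/nlg-eval.git@master    # install the package
nlg-eval --setup    # one-time download of the data/models the metrics require, per the upstream README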
examples/knwl_dialo/eval_resp_generation.sh (new file, mode 100644)
#!/bin/bash

#########################
# Evaluate the F1 scores.
#########################

WORLD_SIZE=1

DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
                  --nnodes 1 \
                  --node_rank 0 \
                  --master_addr localhost \
                  --master_port 6000"

MODEL_GEN_PATH=<PATH_OF_THE_RESPONSE_GENERATION> \
        (e.g., /testseen_response_generations.txt)
GROUND_TRUTH_PATH=<PATH_OF_THE_GROUND_TRUTH_RESPONSE> \
        (e.g., /testseen_response_reference.txt)

python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
        --num-layers 24 \
        --hidden-size 1024 \
        --num-attention-heads 16 \
        --seq-length 2048 \
        --max-position-embeddings 2048 \
        --micro-batch-size 4 \
        --task KNWL-DIALO-EVAL-F1 \
        --guess-file ${MODEL_GEN_PATH} \
        --answer-file ${GROUND_TRUTH_PATH}

##########################
# Evaluate the KF1 scores.
##########################

# KF1 compares the generated responses against the ground-truth knowledge,
# so the same F1 task is reused with the knowledge reference as the answer file.
MODEL_GEN_PATH=<PATH_OF_THE_RESPONSE_GENERATION> \
        (e.g., /testseen_response_generations.txt)
GROUND_TRUTH_PATH=<PATH_OF_THE_GROUND_TRUTH_KNOWLEDGE> \
        (e.g., /testseen_knowledge_reference.txt)

python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
        --num-layers 24 \
        --hidden-size 1024 \
        --num-attention-heads 16 \
        --seq-length 2048 \
        --max-position-embeddings 2048 \
        --micro-batch-size 4 \
        --task KNWL-DIALO-EVAL-F1 \
        --guess-file ${MODEL_GEN_PATH} \
        --answer-file ${GROUND_TRUTH_PATH}

############################################
# Evaluate BLEU, METEOR, and ROUGE-L scores.
############################################

# We follow nlg-eval (https://github.com/Maluuba/nlg-eval) to
# evaluate the BLEU, METEOR, and ROUGE-L scores.
# To evaluate on these metrics, please set up the environment based on
# the nlg-eval GitHub repository, and run the corresponding evaluation commands.
nlg-eval \
        --hypothesis=<PATH_OF_THE_RESPONSE_GENERATION> \
        --references=<PATH_OF_THE_GROUND_TRUTH_RESPONSE>
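Since the F1 and KF1 runs above share every argument except the reference file, the two evaluations can be folded into one loop. A sketch under that assumption; RESP_REF and KNWL_REF are illustrative names, not variables from the script:

RESP_REF=<PATH_OF_THE_GROUND_TRUTH_RESPONSE>
KNWL_REF=<PATH_OF_THE_GROUND_TRUTH_KNOWLEDGE>
for REF in ${RESP_REF} ${KNWL_REF}; do    # first pass scores F1, second pass scores KF1
    python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
            --num-layers 24 \
            --hidden-size 1024 \
            --num-attention-heads 16 \
            --seq-length 2048 \
            --max-position-embeddings 2048 \
            --micro-batch-size 4 \
            --task KNWL-DIALO-EVAL-F1 \
            --guess-file ${MODEL_GEN_PATH} \
            --answer-file ${REF}
done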
examples/knwl_dialo/prep_resp_gen.sh (new file, mode 100644)
#!/bin/bash

# Preparing the input file for the response generation (second-stage prompting)

DIR=`pwd`

TEST_FILE=<PATH_OF_PROCESSED_TEST_DATA> \
        (e.g., /testseen_processed.txt)
KNOWLEDGE_FILE=<PATH_OF_GENERATED_KNOWLEDGE_DATA> \
        (e.g., /testseen_knowledge_generations.txt)
PROCESSED_FILE=<PATH_OF_INPUT_FILE_FOR_RESPONSE_GENERATION> \
        (e.g., /testseen_processed_with_generated_knowledge.txt)

python ${DIR}/tasks/knwl_dialo/preprocessing.py \
        --func prepare_input \
        --test_file ${TEST_FILE} \
        --knowledge_gen_file ${KNOWLEDGE_FILE} \
        --processed_file ${PROCESSED_FILE}
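For a concrete WoW test-seen pass, the three placeholders line up with files produced earlier in the pipeline. A hedged mapping based on the e.g. hints above (WOW_DATA_FOLDER is the same variable used in data_processing.sh; the exact output locations depend on where you pointed the Stage-1 script):

WOW_DATA_FOLDER=<PATH_OF_WIZARD_OF_WIKIPEDIA_DATA_FOLDER>
TEST_FILE=${WOW_DATA_FOLDER}/testseen_processed.txt                                 # from data_processing.sh
KNOWLEDGE_FILE=${WOW_DATA_FOLDER}/testseen_knowledge_generations.txt                # Stage-1 output (prompt_knwl_gen.sh)
PROCESSED_FILE=${WOW_DATA_FOLDER}/testseen_processed_with_generated_knowledge.txt   # Stage-2 input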
examples/knwl_dialo/prompt_knwl_gen.sh (new file, mode 100644)
#!/bin/bash

# Stage-1: Prompt a pretrained language model to generate the context-relevant knowledge
# The input contains prompts and the current dialogue context; the output is the relevant knowledge
# The size of the pretrained language model is 357M

WORLD_SIZE=8

DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
                  --nnodes 1 \
                  --node_rank 0 \
                  --master_addr localhost \
                  --master_port 6000"

CHECKPOINT_PATH=<PATH_OF_LANGUAGE_MODEL> (e.g., /357m)
VOCAB_PATH=<PATH_OF_VOCAB_FILE> (e.g., /gpt2-vocab.json)
MERGE_PATH=<PATH_OF_MERGE_FILE> (e.g., /gpt2-merges.txt)
INPUT_PATH=<PATH_OF_PROCESSED_TEST_DATA_FILE> \
        (e.g., /testseen_processed.txt)
PROMPT_PATH=<PATH_OF_KNOWLEDGE_GENERATION_PROMPTS> \
        (e.g., /testseen_knowledge_prompts.json)
OUTPUT_PATH=<PATH_OF_OUTPUT_GENERATION_FILE> \
        (e.g., /testseen_knowledge_generations.txt)

python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
        --num-layers 24 \
        --hidden-size 1024 \
        --num-attention-heads 16 \
        --seq-length 2048 \
        --max-position-embeddings 2048 \
        --micro-batch-size 1 \
        --vocab-file ${VOCAB_PATH} \
        --merge-file ${MERGE_PATH} \
        --load ${CHECKPOINT_PATH} \
        --fp16 \
        --DDP-impl torch \
        --tokenizer-type GPT2BPETokenizer \
        --sample-input-file ${INPUT_PATH} \
        --sample-output-file ${OUTPUT_PATH} \
        --prompt-file ${PROMPT_PATH} \
        --prompt-type knowledge \
        --num-prompt-examples 10 \
        --task KNWL-DIALO-PROMPT

# NOTE: If you use an API for the model generation, please add
# the "--api-prompt" flag (setting its value to True).
examples/knwl_dialo/prompt_resp_gen.sh (new file, mode 100644)
#!/bin/bash

# Stage-2: Prompt a pretrained language model to generate the corresponding response
# The input contains prompts, the current dialogue context, and the knowledge generated in Stage-1
# The output is the corresponding response.
# The size of the pretrained language model is 357M

WORLD_SIZE=8

DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
                  --nnodes 1 \
                  --node_rank 0 \
                  --master_addr localhost \
                  --master_port 6000"

CHECKPOINT_PATH=<PATH_OF_LANGUAGE_MODEL> (e.g., /357m)
VOCAB_PATH=<PATH_OF_VOCAB_FILE> (e.g., /gpt2-vocab.json)
MERGE_PATH=<PATH_OF_MERGE_FILE> (e.g., /gpt2-merges.txt)
INPUT_PATH=<PATH_OF_INPUT_TEST_DATA_FILE> (e.g., /testseen_processed.txt)
PROMPT_PATH=<PATH_OF_RESPONSE_GENERATION_PROMPTS> \
        (e.g., /response_prompts.txt)
OUTPUT_PATH=<PATH_OF_OUTPUT_GENERATION_FILE> \
        (e.g., /output_testseen_response_generations.txt)

python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
        --num-layers 24 \
        --hidden-size 1024 \
        --num-attention-heads 16 \
        --seq-length 2048 \
        --max-position-embeddings 2048 \
        --micro-batch-size 1 \
        --vocab-file ${VOCAB_PATH} \
        --merge-file ${MERGE_PATH} \
        --load ${CHECKPOINT_PATH} \
        --fp16 \
        --DDP-impl torch \
        --tokenizer-type GPT2BPETokenizer \
        --sample-input-file ${INPUT_PATH} \
        --sample-output-file ${OUTPUT_PATH} \
        --prompt-file ${PROMPT_PATH} \
        --prompt-type response \
        --num-prompt-examples 20 \
        --task KNWL-DIALO-PROMPT

# NOTE: If you use an API for the model generation, please add
# the "--api-prompt" flag (setting its value to True).