Commit 5833553a authored by Jared Casper

Merge branch 'multistage-prompting/main-multistage' into 'main'

Multistage prompting/main multistage

See merge request ADLR/megatron-lm!371
parents 945ece94 b8707ee2
# Multi-Stage Prompting for Knowledgeable Dialogue Generation
This directory contains all the scripts for multi-stage prompting for knowledgeable dialogue generation, including data preparation and knowledge/response generation. More details are available in the [`knowledgeable dialogue task directory`](../../tasks/msdp).
#!/bin/bash
# Data preparation for our framework: preprocessing the WoW and WoI datasets
# The datasets can be downloaded through the following links:
# WoW: https://parl.ai/projects/wizard_of_wikipedia/
# WoI: https://parl.ai/projects/sea/
DIR=`pwd`
# Before running the preprocessing, please download
# the Wizard of Wikipedia and Wizard of Internet datasets
WOW_DATA_FOLDER=<PATH_OF_WIZARD_OF_WIKIPEDIA_DATA_FOLDER>
WOI_DATA_FOLDER=<PATH_OF_WIZARD_OF_INTERNET_DATA_FOLDER>
# We provide examples for processing the raw data from Wizard of Wikipedia
# Processing the train dataset (train.json)
python ${DIR}/tasks/msdp/preprocessing.py \
--func process_wow_dataset \
--raw_file ${WOW_DATA_FOLDER}/train.json \
--processed_file ${WOW_DATA_FOLDER}/train_processed.txt
# Processing the test seen dataset (test_random_split.json)
python ${DIR}/tasks/msdp/preprocessing.py \
--func process_wow_dataset \
--raw_file ${WOW_DATA_FOLDER}/test_random_split.json \
--processed_file ${WOW_DATA_FOLDER}/testseen_processed.txt \
--knwl_ref_file ${WOW_DATA_FOLDER}/output_testseen_knowledge_reference.txt \
--resp_ref_file ${WOW_DATA_FOLDER}/output_testseen_response_reference.txt
# Processing the test unseen dataset (test_topic_split.json)
python ${DIR}/tasks/msdp/preprocessing.py \
--func process_wow_dataset \
--raw_file ${WOW_DATA_FOLDER}/test_topic_split.json \
--processed_file ${WOW_DATA_FOLDER}/testunseen_processed.txt \
--knwl_ref_file ${WOW_DATA_FOLDER}/output_testunseen_knowledge_reference.txt \
--resp_ref_file ${WOW_DATA_FOLDER}/output_testunseen_response_reference.txt
# We provide the following script to process the raw data from Wizard of Internet
# Processing the test dataset (test.jsonl)
python ${DIR}/tasks/msdp/preprocessing.py \
--func process_woi_dataset \
--raw_file ${WOI_DATA_FOLDER}/test.jsonl \
--processed_file ${WOI_DATA_FOLDER}/test_processed.txt \
--knwl_ref_file ${WOI_DATA_FOLDER}/output_test_knowledge_reference.txt \
--resp_ref_file ${WOI_DATA_FOLDER}/output_test_response_reference.txt
# Get the knowledge generation prompts for each test dataset in WoW and WoI
MODEL_FILE=<PATH_OF_THE_FINETUNED_DPR_MODEL>
# WoW test seen
python ${DIR}/tasks/msdp/preprocessing.py \
--func get_knwl_gen_prompts \
--test_file ${WOW_DATA_FOLDER}/testseen_processed.txt \
--train_file ${WOW_DATA_FOLDER}/train_processed.txt \
--model_file ${MODEL_FILE} \
--processed_file ${WOW_DATA_FOLDER}/output_testseen_knowledge_prompts.json \
--data_type wow_seen
# WoW test unseen
python ${DIR}/tasks/msdp/preprocessing.py \
--func get_knwl_gen_prompts \
--test_file ${WOW_DATA_FOLDER}/testunseen_processed.txt \
--train_file ${WOW_DATA_FOLDER}/train_processed.txt \
--model_file ${MODEL_FILE} \
--processed_file ${WOW_DATA_FOLDER}/output_testunseen_knowledge_prompts.json \
--data_type wow_unseen
# WoI
python ${DIR}/tasks/msdp/preprocessing.py \
--func get_knwl_gen_prompts \
--test_file ${WOI_DATA_FOLDER}/test_processed.txt \
--train_file ${WOW_DATA_FOLDER}/train_processed.txt \
--model_file ${MODEL_FILE} \
--processed_file ${WOI_DATA_FOLDER}/output_test_knowledge_prompts.json \
--data_type woi
# Get the response generation prompts (can be applied for all the test datasets)
python ${DIR}/tasks/msdp/preprocessing.py \
--func get_resp_gen_prompts \
--train_file ${WOW_DATA_FOLDER}/train_processed.txt \
--processed_file ${WOW_DATA_FOLDER}/output_response_prompts.txt
#!/bin/bash
#########################
# Evaluate the F1 scores.
#########################
WORLD_SIZE=1
DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
--nnodes 1 \
--node_rank 0 \
--master_addr localhost \
--master_port 6000"
MODEL_GEN_PATH=<PATH_OF_THE_KNOWLEDGE_GENERATION>    # e.g., /testseen_knowledge_generations.txt
GROUND_TRUTH_PATH=<PATH_OF_THE_GROUND_TRUTH_KNOWLEDGE>    # e.g., /testseen_knowledge_reference.txt
python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 2048 \
--max-position-embeddings 2048 \
--micro-batch-size 4 \
--task MSDP-EVAL-F1 \
--guess-file ${MODEL_GEN_PATH} \
--answer-file ${GROUND_TRUTH_PATH}
############################################
# Evaluate BLEU, METEOR, and ROUGE-L scores.
############################################
# We follow nlg-eval (https://github.com/Maluuba/nlg-eval) to
# evaluate the BLEU, METEOR, and ROUGE-L scores.
# To evaluate these metrics, please set up the environment following
# the nlg-eval GitHub instructions, then run the command below.
nlg-eval \
--hypothesis=<PATH_OF_THE_KNOWLEDGE_GENERATION> \
--references=<PATH_OF_THE_GROUND_TRUTH_KNOWLEDGE>
#!/bin/bash
#########################
# Evaluate the F1 scores.
#########################
WORLD_SIZE=1
DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
--nnodes 1 \
--node_rank 0 \
--master_addr localhost \
--master_port 6000"
MODEL_GEN_PATH=<PATH_OF_THE_RESPONSE_GENERATION>    # e.g., /testseen_response_generations.txt
GROUND_TRUTH_PATH=<PATH_OF_THE_GROUND_TRUTH_RESPONSE>    # e.g., /testseen_response_reference.txt
python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 2048 \
--max-position-embeddings 2048 \
--micro-batch-size 4 \
--task MSDP-EVAL-F1 \
--guess-file ${MODEL_GEN_PATH} \
--answer-file ${GROUND_TRUTH_PATH}
##########################
# Evaluate the KF1 scores.
##########################
MODEL_GEN_PATH=<PATH_OF_THE_RESPONSE_GENERATION>    # e.g., /testseen_response_generations.txt
GROUND_TRUTH_PATH=<PATH_OF_THE_GROUND_TRUTH_KNOWLEDGE>    # e.g., /testseen_knowledge_reference.txt
python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 2048 \
--max-position-embeddings 2048 \
--micro-batch-size 4 \
--task MSDP-EVAL-F1 \
--guess-file ${MODEL_GEN_PATH} \
--answer-file ${GROUND_TRUTH_PATH}
############################################
# Evaluate BLEU, METEOR, and ROUGE-L scores.
############################################
# We follow nlg-eval (https://github.com/Maluuba/nlg-eval) to
# evaluate the BLEU, METEOR, and ROUGE-L scores.
# To evaluate these metrics, please set up the environment following
# the nlg-eval GitHub instructions, then run the command below.
nlg-eval \
--hypothesis=<PATH_OF_THE_RESPONSE_GENERATION> \
--references=<PATH_OF_THE_GROUND_TRUTH_RESPONSE>
#!/bin/bash
# Preparing the input file for the response generation (second-stage prompting)
DIR=`pwd`
TEST_FILE=<PATH_OF_PROCESSED_TEST_DATA>    # e.g., /testseen_processed.txt
KNOWLEDGE_FILE=<PATH_OF_GENERATED_KNOWLEDGE_DATA>    # e.g., /testseen_knowledge_generations.txt
PROCESSED_FILE=<PATH_OF_INPUT_FILE_FOR_RESPONSE_GENERATION>    # e.g., /testseen_processed_with_generated_knowledge.txt
python ${DIR}/tasks/msdp/preprocessing.py \
--func prepare_input \
--test_file ${TEST_FILE} \
--knowledge_gen_file ${KNOWLEDGE_FILE} \
--processed_file ${PROCESSED_FILE}
#!/bin/bash
# Stage-1: Prompt a pretrained language model to generate the context-relevant knowledge
# The input contains prompts and the current dialogue context; the output is the context-relevant knowledge
# The size of the pretrained language model is 357M
WORLD_SIZE=8
DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
--nnodes 1 \
--node_rank 0 \
--master_addr localhost \
--master_port 6000"
CHECKPOINT_PATH=<PATH_OF_LANGUAGE_MODEL>    # e.g., /357m
VOCAB_PATH=<PATH_OF_VOCAB_FILE>    # e.g., /gpt2-vocab.json
MERGE_PATH=<PATH_OF_MERGE_FILE>    # e.g., /gpt2-merges.txt
INPUT_PATH=<PATH_OF_PROCESSED_TEST_DATA_FILE>    # e.g., /testseen_processed.txt
PROMPT_PATH=<PATH_OF_KNOWLEDGE_GENERATION_PROMPTS>    # e.g., /testseen_knowledge_prompts.json
OUTPUT_PATH=<PATH_OF_OUTPUT_GENERATION_FILE>    # e.g., /testseen_knowledge_generations.txt
python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 2048 \
--max-position-embeddings 2048 \
--micro-batch-size 1 \
--vocab-file ${VOCAB_PATH} \
--merge-file ${MERGE_PATH} \
--load ${CHECKPOINT_PATH} \
--fp16 \
--DDP-impl torch \
--tokenizer-type GPT2BPETokenizer \
--sample-input-file ${INPUT_PATH} \
--sample-output-file ${OUTPUT_PATH} \
--prompt-file ${PROMPT_PATH} \
--prompt-type knowledge \
--num-prompt-examples 10 \
--task MSDP-PROMPT
# NOTE: If you use an API for the model generation, please add
# the "--api-prompt" flag (which sets this value to True).
#!/bin/bash
# Stage-2: Prompt a pretrained language model to generate the corresponding response
# The input contains prompts, the current dialogue context, and the knowledge generated in Stage-1.
# The output is the corresponding response.
# The size of the pretrained language model is 357M
WORLD_SIZE=8
DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
--nnodes 1 \
--node_rank 0 \
--master_addr localhost \
--master_port 6000"
CHECKPOINT_PATH=<PATH_OF_LANGUAGE_MODEL>    # e.g., /357m
VOCAB_PATH=<PATH_OF_VOCAB_FILE>    # e.g., /gpt2-vocab.json
MERGE_PATH=<PATH_OF_MERGE_FILE>    # e.g., /gpt2-merges.txt
INPUT_PATH=<PATH_OF_INPUT_TEST_DATA_FILE>    # e.g., /testseen_processed.txt
PROMPT_PATH=<PATH_OF_RESPONSE_GENERATION_PROMPTS>    # e.g., /response_prompts.txt
OUTPUT_PATH=<PATH_OF_OUTPUT_GENERATION_FILE>    # e.g., /output_testseen_response_generations.txt
python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 2048 \
--max-position-embeddings 2048 \
--micro-batch-size 1 \
--vocab-file ${VOCAB_PATH} \
--merge-file ${MERGE_PATH} \
--load ${CHECKPOINT_PATH} \
--fp16 \
--DDP-impl torch \
--tokenizer-type GPT2BPETokenizer \
--sample-input-file ${INPUT_PATH} \
--sample-output-file ${OUTPUT_PATH} \
--prompt-file ${PROMPT_PATH} \
--prompt-type response \
--num-prompt-examples 20 \
--task MSDP-PROMPT
# NOTE: If you use an API for the model generation, please add
# the "--api-prompt" flag (which sets this value to True).
# Multi-Stage Prompting for Knowledgeable Dialogue Generation
Below we present the steps to run our multi-stage dialogue prompting (MSDP) framework.
## Multi-Stage Dialogue Prompting
### Data Preparation
1. Dataset Download: [Wizard of Wikipedia](https://parl.ai/projects/wizard_of_wikipedia/) and [Wizard of Internet](https://parl.ai/projects/sea/)
2. Data Processing: We provide the script to run the [`data processing`](../../examples/msdp/data_processing.sh) of the datasets (an illustrative sketch of the processed data format appears after the steps below).
### Stage-1: Prompting for Knowledge Generation
1. We provide the script to perform the [`first-stage prompting`](../../examples/msdp/prompt_knwl_gen.sh) for the knowledge generation.
2. We provide the [`evaluation script`](../../examples/msdp/eval_knwl_generation.sh) for the automatic evaluation (i.e., F1, BLEU, METEOR, and ROUGE-L) of the knowledge generation.
### Stage-2: Prompting for Response Generation
1. We provide the script to [`prepare the input file`](../../examples/msdp/prep_resp_gen.sh) for the response generation (based on the previously generated knowledge file).
2. We provide the script to perform the [`second-stage prompting`](../../examples/msdp/prompt_resp_gen.sh) for the response generation.
3. We provide the [`evaluation script`](../../examples/msdp/eval_resp_generation.sh) for the automatic evaluation (i.e., F1, KF1, BLEU, METEOR, and ROUGE-L) of the response generation.
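For reference, `tasks/msdp/prompt.py` reads its input files as tab-separated lines: the first field is the topic and the second is the dialogue context with turns joined by ` [SEP] `; for the second-stage (response) prompting, a third field holds the knowledge generated in Stage-1. The minimal sketch below parses one such line the same way the prompting code does; the line content itself is invented purely for illustration.

```python
# Illustrative only: the line content is invented, but the tab/[SEP] layout
# mirrors how tasks/msdp/prompt.py parses its input files.
line = "Blue whale\tHi there! [SEP] Do you know anything about blue whales?\tThe blue whale is a marine mammal."

topic, context, knowledge = line.split("\t")
turns = context.split(" [SEP] ")   # dialogue turns, oldest to newest
last_turn = turns[-1]              # the latest user utterance

print(topic)       # Blue whale
print(last_turn)   # Do you know anything about blue whales?
print(knowledge)   # The blue whale is a marine mammal.
```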
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Model evaluation"""
from megatron import get_args
from megatron import print_rank_0
from tasks.msdp.metrics import F1Metric
from tqdm import tqdm
def evaluate_f1(guess_file, answer_file):
"""Evaluating F1 Score"""
guess_list = []
print_rank_0('reading %s' % guess_file)
with open(guess_file, "r") as f:
for i, line in enumerate(tqdm(f)):
line = line.strip()
if "<|endoftext|>" in line:
line = line.replace("<|endoftext|>", "")
guess_list.append(line)
answer_list = []
print_rank_0('reading %s' % answer_file)
with open(answer_file, "r") as f:
for i, line in enumerate(tqdm(f)):
line = line.strip()
if line == "no_passages_used":
line = ""
answer_list.append(line)
assert len(guess_list) == len(answer_list), \
"lengths of guess and answer are different!"
precision, recall, f1 = F1Metric.compute_all_pairs(guess_list, answer_list)
print_rank_0('Precision: %.4f; recall: %.4f; f1: %.4f' % (precision, recall, f1))
print_rank_0('done :-)')
def main():
args = get_args()
evaluate_f1(args.guess_file, args.answer_file)
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Run multi-stage dialogue prompting (MSDP)."""
import os
import sys
sys.path.append(os.path.abspath(os.path.join(
os.path.join(os.path.dirname(__file__), os.path.pardir), os.path.pardir)))
from megatron import get_args
from megatron.initialize import initialize_megatron
def get_tasks_args(parser):
"""Provide extra arguments required for tasks."""
group = parser.add_argument_group(title='tasks')
# parameters for the knowledgeable dialogue generation
group.add_argument('--task', type=str, required=True,
help='Task name.')
group.add_argument("--sample-input-file", type=str, default=None,
help='Get input from file instead of interactive mode, '
'each line is an input.')
group.add_argument("--sample-output-file", type=str, default=None,
help='Output file generated from --sample-input-file')
group.add_argument('--prompt-file', type=str, default=None,
help='prompting file')
group.add_argument('--prompt-type', type=str, default=None,
choices=['knowledge', 'response'],
help='prompt type (knowledge or response)')
group.add_argument('--num-prompt-examples', type=int, default=10,
help='number of prompt examples')
group.add_argument('--guess-file', type=str, default=None,
help='datapath for generated sentences')
group.add_argument('--answer-file', type=str, default=None,
help='datapath for golden sentences')
group.add_argument('--out-seq-length', type=int, default=100,
help='output sequence length')
group.add_argument('--api-prompt', default=False, action="store_true",
help='setup model api for prompting')
group.add_argument('--megatron-api-url', type=str, default=None,
help='url of the megatron api')
return parser
if __name__ == '__main__':
initialize_megatron(extra_args_provider=get_tasks_args)
args = get_args()
if args.num_layers_per_virtual_pipeline_stage is not None:
print("Interleaved pipeline schedule is not yet supported for downstream tasks.")
exit()
if args.task == 'MSDP-PROMPT':
from tasks.msdp.prompt import main
elif args.task == 'MSDP-EVAL-F1':
from tasks.msdp.evaluate import main
else:
raise NotImplementedError('Task {} is not implemented.'.format(
args.task))
main()
# The following code is adapted from
# https://github.com/facebookresearch/ParlAI/blob/master/parlai/core/metrics.py,
# which is licensed under the MIT license. More details on the license can be
# found at https://github.com/facebookresearch/ParlAI/blob/master/LICENSE.
"""Provides standard metric evaluations for dialog."""
from collections import Counter
from typing import List
import numpy as np
import re
re_art = re.compile(r'\b(a|an|the)\b')
re_punc = re.compile(r'[!"#$%&()*+,-./:;<=>?@\[\]\\^`{|}~_\']')
def normalize_answer(s):
"""
Lower text and remove punctuation, articles and extra whitespace.
"""
s = s.lower()
s = re_punc.sub(' ', s)
s = re_art.sub(' ', s)
s = ' '.join(s.split())
return s
class F1Metric:
"""
Helper class which computes token-level F1.
"""
@staticmethod
def _prec_recall_f1_score(pred_items, gold_items):
"""
Compute precision, recall and f1 given a set of gold and prediction items.
:param pred_items: iterable of predicted values
:param gold_items: iterable of gold values
:return: tuple (p, r, f1) for precision, recall, f1
"""
common = Counter(gold_items) & Counter(pred_items)
num_same = sum(common.values())
if num_same == 0:
return 0, 0, 0
precision = 1.0 * num_same / len(pred_items)
recall = 1.0 * num_same / len(gold_items)
f1 = (2 * precision * recall) / (precision + recall)
return precision, recall, f1
@staticmethod
def compute_each_pair(guess: str, answer: str):
if answer == "":
return None, None, None
if guess == "":
return 0, 0, 0
g_tokens = normalize_answer(guess).split()
a_tokens = normalize_answer(answer).split()
precision, recall, f1 = F1Metric._prec_recall_f1_score(g_tokens, a_tokens)
return precision, recall, f1
@staticmethod
def compute_all_pairs(guesses: List[str], answers: List[str]):
# sanity check: the number of guesses must match the number of answers
assert len(guesses) == len(answers)
precision_list, recall_list, f1_list = [], [], []
for guess, answer in zip(guesses, answers):
precision, recall, f1 = F1Metric.compute_each_pair(guess, answer)
if precision is None or recall is None or f1 is None:
continue
precision_list.append(precision)
recall_list.append(recall)
f1_list.append(f1)
return np.mean(precision_list), np.mean(recall_list), np.mean(f1_list)
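# Illustrative usage (not part of the original module): compute_all_pairs averages
# the per-pair token-level scores and skips any pair whose reference is empty.
# The strings below are made up purely for illustration, not data from any dataset.
if __name__ == "__main__":
    example_guesses = ["The blue whale is the largest animal.",
                       "I am not sure about that."]
    example_answers = ["The blue whale is the largest known animal.",
                       ""]  # empty reference, so this pair is skipped
    p, r, f1 = F1Metric.compute_all_pairs(example_guesses, example_answers)
    print("precision=%.4f recall=%.4f f1=%.4f" % (p, r, f1))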
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Prompting the pretrained language model to generate knowledge/response"""
import json
import torch
import requests
from nltk import word_tokenize
from megatron import mpu
from megatron import get_args
from megatron import print_rank_0
from megatron import get_tokenizer
from megatron.model import GPTModel
from megatron.training import get_model
from megatron.checkpointing import load_checkpoint
from megatron.initialize import initialize_megatron
from megatron.text_generation import generate_and_post_process
def call_model_api(inputs, tokens_to_generate):
"""Calling the model api to get the output generations"""
args = get_args()
# The following is an example of using the Megatron API.
# You can also implement your own API function to replace this part.
headers = {'Content-Type': 'application/json; charset=UTF-8'}
data = {"prompts": [inputs], "tokens_to_generate": tokens_to_generate, "top_k": 1}
data_json = json.dumps(data)
outputs = requests.put(args.megatron_api_url, headers=headers, data=data_json).json()["text"][0]
input_len = len(inputs)
outputs = outputs[input_len:]
outputs = outputs.split("\n")[0].strip()
return outputs
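# As the comment above notes, call_model_api is only one example of using the
# Megatron API, and any generation service can be substituted. The sketch below is
# a hypothetical drop-in replacement: the endpoint URL and the JSON field names are
# illustrative assumptions, not part of this repository. Like call_model_api, it
# takes the prompt string and a token budget and returns only the first generated line.
def call_custom_model_api(inputs, tokens_to_generate):
    """Hypothetical example of a user-provided generation API wrapper."""
    payload = {"prompt": inputs, "max_tokens": tokens_to_generate}
    response = requests.post("http://localhost:5000/generate", json=payload)
    # Assume the server returns the prompt followed by its generation.
    text = response.json()["text"]
    generation = text[len(inputs):]
    # Keep only the first generated line, as call_model_api does.
    return generation.split("\n")[0].strip()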
def read_prompts(prompt_path, prompt_type, n_example):
"""Read prompt data"""
if prompt_type == "knowledge":
# prompts for the knowledge generation
prompt_examples_dict = {}
# read prompt_path
with open(prompt_path, "r") as f:
for i, line in enumerate(f):
line = line.strip()
line_dict = json.loads(line)
key = list(line_dict.keys())[0]
if key not in prompt_examples_dict:
prompt_examples = line_dict[key]
prompt = ""
for instance in prompt_examples:
instance = instance.strip()
prompt += instance + " \n"
prompt_examples_dict[key] = prompt
return prompt_examples_dict
else:
# prompts for the response generation
# read prompt_path
prompt = ""
with open(prompt_path, "r") as f:
prompt_examples = f.readlines()
prompt_examples = prompt_examples[:n_example]
for instance in prompt_examples:
instance = instance.strip()
prompt += instance + " \n"
return prompt
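# For reference, read_prompts above expects two formats. The knowledge prompt file
# is JSON-lines: each line maps a key (built elsewhere as topic + " " + the last
# dialogue turn) to a list of prompt example strings. The response prompt file is
# plain text with one prompt example per line. The helper below writes a single
# hypothetical knowledge-prompt line; the topic, turn, and example text are made up
# purely for illustration and are not taken from any dataset.
def _write_example_knowledge_prompt_line(fname_out):
    """Illustrative only: emit one JSON-lines entry in the expected format."""
    key = "Blue whale" + " " + "Do you know anything about blue whales?"
    examples = ["( Do you know anything about blue whales? ) Blue whale => "
                "The blue whale is the largest animal known to have existed."]
    fname_out.write(json.dumps({key: examples}) + "\n")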
def generate_samples_by_calling_api():
""" Generate outputs by calling"""
args = get_args()
assert args.prompt_type in ["knowledge", "response"], \
"Please input a correct prompt type!"
if args.prompt_type == "knowledge":
# read knowledge generation prompts
knwl_gen_prompt_dict = read_prompts(
args.prompt_file, args.prompt_type, args.num_prompt_examples)
else:
resp_gen_prompt = read_prompts(
args.prompt_file, args.prompt_type, args.num_prompt_examples)
# read the test data
fname = open(args.sample_input_file, "r")
test_sample_list = fname.readlines()
# create output file
fname_out = open(args.sample_output_file, "w")
# call the api to get the output generations
for test_sample in test_sample_list:
test_sample = test_sample.strip()
splits = test_sample.split("\t")
topic = splits[0]
# prepare the inputs for the api
if args.prompt_type == "knowledge":
## inputs = prompt + current test
# get the prompt
turns = splits[1].split(" [SEP] ")
last_turn = turns[-1]
key = topic + " " + last_turn
inputs = knwl_gen_prompt_dict[key]
# add current test
inputs += "( " + last_turn + " ) " + topic + " =>"
else:
# inputs = prompt + current test
# get the prompt
inputs = resp_gen_prompt
# add current test
turns = splits[1].split(" [SEP] ")
knowledge = splits[2]
last_turn = turns[-1]
last_turn = " ".join(word_tokenize(last_turn))
knowledge = " ".join(word_tokenize(knowledge))
knowledge = knowledge.strip()
last_turn = last_turn.strip()
inputs += "Topic: " + topic + ". "
inputs += "User says: " + last_turn + " "
inputs += "We know that: " + knowledge + " "
inputs += "System replies:"
# get the output generations from the api,
# and write to the output file
generations = call_model_api(inputs, args.out_seq_length)
fname_out.write(generations)
fname_out.write("\n")
fname.close()
fname_out.close()
def model_provider(pre_process=True, post_process=True):
"""Build the model."""
print_rank_0('building GPT model ...')
model = GPTModel(
num_tokentypes=0,
parallel_output=True,
pre_process=pre_process,
post_process=post_process
)
return model
def generate_samples_by_prompting_input_from_file(model):
"""Prompt a pretrained language model to generate knowledge/response"""
# get tokenizer
args = get_args()
tokenizer = get_tokenizer()
# Read the sample file and open the output file.
assert args.sample_input_file is not None, \
'sample input file is not provided.'
if mpu.is_pipeline_first_stage() and mpu.get_tensor_model_parallel_rank() == 0:
fname = open(args.sample_input_file, "r")
all_raw_text = fname.readlines()
input_count = len(all_raw_text)
if args.sample_output_file is None:
sample_output_file = args.sample_input_file + ".out"
print('`sample-output-file` not specified, setting '
'it to {}'.format(sample_output_file))
else:
sample_output_file = args.sample_output_file
fname_out = open(sample_output_file, "w")
# only two prompt types (i.e., knowledge and response) are allowed
assert args.prompt_type in ["knowledge", "response"], \
"Please input a correct prompt type!"
# Read the prompt file
if args.prompt_type == "knowledge":
# read the prompts for the knowledge generation
prompt_examples_dict = {}
with open(args.prompt_file, "r") as f:
for i, line in enumerate(f):
line = line.strip()
line_dict = json.loads(line)
key = list(line_dict.keys())[0]
# get the prompt examples based on the key
if key not in prompt_examples_dict:
prompt_examples = line_dict[key]
prompt = ""
for instance in prompt_examples:
instance = instance.strip()
prompt += instance + " \n"
prompt_examples_dict[key] = prompt
else:
# read the prompts for the response generation
# prompts are fixed for all test samples
with open(args.prompt_file, "r") as f:
prompt_examples = f.readlines()
prompt_examples = prompt_examples[:args.num_prompt_examples]
prompt = ""
for instance in prompt_examples:
instance = instance.strip()
prompt += instance + " \n"
input_pos = 0
model.eval()
# perform prompting
with torch.no_grad():
while True:
raw_text_len = 0
if mpu.is_pipeline_first_stage() \
and mpu.get_tensor_model_parallel_rank() == 0:
input_str = all_raw_text[input_pos]
input_str = input_str.strip()
splits = input_str.split("\t")
topic = splits[0]
if args.prompt_type == "knowledge":
# first add the prompt into the raw_text
turns = splits[1].split(" [SEP] ")
last_turn = turns[-1]
key = topic + " " + last_turn
raw_text = prompt_examples_dict[key]
# construct inputs for knowledge generation
# then add the constructed inputs into the raw_text
raw_text += "( " + last_turn + " ) " + topic + " =>"
else:
# first add the prompt into the raw_text
raw_text = prompt
# construct inputs for response generation
# then add the constructed inputs into the raw_text
turns = splits[1].split(" [SEP] ")
knowledge = splits[2]
last_turn = turns[-1]
last_turn = " ".join(word_tokenize(last_turn))
knowledge = " ".join(word_tokenize(knowledge))
knowledge = knowledge.strip()
last_turn = last_turn.strip()
raw_text += "Topic: " + topic + ". "
raw_text += "User says: " + last_turn + " "
raw_text += "We know that: " + knowledge + " "
raw_text += "System replies:"
input_pos += 1
raw_text_len = len(raw_text)
else:
raw_text = "EMPTY TEXT"
if input_pos % 100 == 0:
print_rank_0("input_pos: %d" % input_pos)
outputs = generate_and_post_process(
model=model,
prompts=[raw_text],
tokens_to_generate=args.out_seq_length,
top_k_sampling=1)
prompts_plus_generations = outputs[0]
prompts_plus_generations = prompts_plus_generations[0]
# write the generated output to the output file
if mpu.get_tensor_model_parallel_rank() == 0:
if mpu.is_pipeline_first_stage():
generations = prompts_plus_generations[raw_text_len:]
generations = generations.split("\n")[0]
generations = generations.strip()
fname_out.write(generations)
fname_out.write("\n")
raw_text = None
if input_pos == input_count:
return
def main():
args = get_args()
if args.api_prompt:
# obtain the generations by calling the api
generate_samples_by_calling_api()
return
if args.num_layers_per_virtual_pipeline_stage is not None:
print("Interleaved pipeline schedule is not yet supported for text generation.")
exit()
# Set up model and load checkpoint.
model = get_model(model_provider, wrap_with_ddp=False)
if args.load is not None:
_ = load_checkpoint(model, None, None)
assert len(model) == 1, "Above condition should have caught this"
model = model[0]
# perform the prompting
generate_samples_by_prompting_input_from_file(model)