"hip/fmoe_cuda.cpp" did not exist on "a807e2a3bc62267c158d10a81a264ace5129c865"
Commit 5fd4fd28 authored by zihanl

restore to original

parent fb3328fe
@@ -192,433 +192,6 @@ def generate_samples_input_from_file(model):
            context_count += 1


def generate_samples_line_by_line_input_from_file(model):

    args = get_args()
    tokenizer = get_tokenizer()

    # Read the sample file and open the output file.
    assert args.sample_input_file is not None, \
        'sample input file is not provided.'
    if mpu.is_pipeline_first_stage() and mpu.get_tensor_model_parallel_rank() == 0:
        fname = open(args.sample_input_file, "r")
        all_raw_text = fname.readlines()
        input_count = len(all_raw_text)
        input_pos = 0
        if args.sample_output_file is None:
            sample_output_file = args.sample_input_file + ".out"
            print('`sample-output-file` not specified, setting '
                  'it to {}'.format(sample_output_file))
        else:
            sample_output_file = args.sample_output_file
        fname_out = open(sample_output_file, "w")

    context_count = 0
    model.eval()
    with torch.no_grad():
        while True:
            raw_text_len = 0
            if mpu.is_pipeline_first_stage() \
               and mpu.get_tensor_model_parallel_rank() == 0:
                raw_text = all_raw_text[input_pos]
                input_pos += 1
                raw_text_len = len(raw_text)
                context_tokens = tokenizer.tokenize(raw_text)
            else:
                context_tokens = tokenizer.tokenize("EMPTY TEXT")

            if input_pos % 100 == 0:
                print_rank_0("input_pos: %d" % input_pos)

            token_stream = get_token_stream(model, [context_tokens])
            for _, decode_tokens in enumerate(token_stream):
                pass

            if mpu.get_tensor_model_parallel_rank() == 0:
                if mpu.is_pipeline_first_stage():
                    decode_tokens, _ = decode_tokens
                    decode_tokens = decode_tokens[0].cpu().numpy().tolist()
                    trim_decode_tokens = tokenizer.detokenize(
                        decode_tokens)[raw_text_len:]
                    if "\r" in trim_decode_tokens:
                        trim_decode_tokens = trim_decode_tokens.replace("\r", "")
                    if "\n" in trim_decode_tokens:
                        trim_decode_tokens = trim_decode_tokens.replace("\n", "")
                    fname_out.write(trim_decode_tokens)
                    fname_out.write("\n")

            raw_text = None
            context_count += 1
            if input_pos == input_count:
                return
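# Illustrative sketch (not part of the original diff): the function above slices the
# detokenized output with [raw_text_len:] to drop the echoed prompt, then strips
# carriage returns and newlines before writing one completion per line. The strings
# below are hypothetical example values used only to demonstrate that trimming step.
raw_text_demo = "The capital of France is"
detokenized_demo = raw_text_demo + " Paris.\n"          # decoder echoes prompt + continuation
trimmed_demo = detokenized_demo[len(raw_text_demo):].replace("\r", "").replace("\n", "")
assert trimmed_demo == " Paris."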
def generate_samples_prompt_input_from_file(model):

    args = get_args()
    tokenizer = get_tokenizer()
    from nltk import word_tokenize

    # Read the sample file and open the output file.
    assert args.sample_input_file is not None, \
        'sample input file is not provided.'
    if mpu.is_pipeline_first_stage() and mpu.get_tensor_model_parallel_rank() == 0:
        fname = open(args.sample_input_file, "r")
        all_raw_text = fname.readlines()
        input_count = len(all_raw_text)
        input_pos = 0
        if args.sample_output_file is None:
            sample_output_file = args.sample_input_file + ".out"
            print('`sample-output-file` not specified, setting '
                  'it to {}'.format(sample_output_file))
        else:
            sample_output_file = args.sample_output_file
        fname_out = open(sample_output_file, "w")

    # Read the prompt file
    if args.dynamic_prompt:
        prompt_examples_dict = {}
        with open(args.prompt_file, "r") as f:
            for i, line in enumerate(f):
                line = line.strip()
                line_dict = json.loads(line)
                key = list(line_dict.keys())[0]
                if key not in prompt_examples_dict:
                    prompt_examples = line_dict[key]
                    prompt = ""
                    for instance in prompt_examples:
                        instance = instance.strip()
                        prompt += instance + " \n"
                    prompt_examples_dict[key] = prompt
    else:
        with open(args.prompt_file, "r") as f:
            prompt_examples = f.readlines()
            prompt_examples = prompt_examples[:args.num_prompt_examples]
            prompt = ""
            for instance in prompt_examples:
                instance = instance.strip()
                prompt += instance + " \n"

    assert args.prompt_type in ["knowledge", "knowledge_notopic", "dialogue", "dialogue_notopic"]

    context_count = 0
    model.eval()
    with torch.no_grad():
        while True:
            raw_text_len = 0
            if mpu.is_pipeline_first_stage() \
               and mpu.get_tensor_model_parallel_rank() == 0:
                input_str = all_raw_text[input_pos]
                input_str = input_str.strip()
                splits = input_str.split("\t")
                control_codes = splits[0].split(" [CTRL] ")
                topic = control_codes[0]

                if args.dynamic_prompt:
                    turns = splits[1].split(" [SEP] ")
                    last_turn = turns[-1]
                    key = topic + " " + last_turn
                    raw_text = prompt_examples_dict[key]
                else:
                    raw_text = prompt

                if args.prompt_type == "knowledge":
                    turns = splits[1].split(" [SEP] ")
                    context = turns[-1]
                    raw_text += "( " + context + " ) " + topic + " =>"
                    # raw_text += "( " + context + " ) " + topic + ":"
                    # raw_text += "( " + context + " ) " + topic + " ->"

                elif args.prompt_type == "knowledge_notopic":
                    turns = splits[1].split(" [SEP] ")[-3:]
                    for j, turn in enumerate(turns):
                        if j != 0:
                            raw_text += " "
                        else:
                            raw_text += "( " + turn + " )"
                    raw_text += " =>"

                elif args.prompt_type == "dialogue":
                    turns = splits[1].split(" [SEP] ")
                    # context = turns[-1]
                    ctrl_sent = splits[2]
                    ctrl_sent = " ".join(word_tokenize(ctrl_sent))

                    # ## version one
                    # turns = turns[-3:]
                    # raw_text += "Topic: " + topic + ". "
                    # if len(turns) == 2:
                    #     for idx, turn in enumerate(turns):
                    #         if idx % 2 == 0:
                    #             raw_text += "System: " + turn + " "
                    #         else:
                    #             raw_text += "User: " + turn + " "
                    # else:
                    #     for idx, turn in enumerate(turns):
                    #         if idx % 2 == 0:
                    #             raw_text += "User: " + turn + " "
                    #         else:
                    #             raw_text += "System: " + turn + " "
                    # raw_text += "We know that: " + ctrl_sent + " "
                    # raw_text += "Therefore, the System will say:"

                    ## version two
                    last_turn = turns[-1]
                    ctrl_sent = ctrl_sent.strip()
                    last_turn = last_turn.strip()
                    raw_text += "Topic: " + topic + ". "
                    raw_text += "User says: " + last_turn + " "
                    raw_text += "We know that: " + ctrl_sent + " "
                    raw_text += "System replies:"

                else:
                    turns = splits[1].split(" [SEP] ")
                    # context = turns[-1]
                    ctrl_sent = splits[2]
                    ctrl_sent = " ".join(word_tokenize(ctrl_sent))

                    ## version two
                    last_turn = turns[-1]
                    ctrl_sent = ctrl_sent.strip()
                    last_turn = last_turn.strip()
                    raw_text += "User says: " + last_turn + " "
                    raw_text += "We know that: " + ctrl_sent + " "
                    raw_text += "System replies:"

                input_pos += 1
                raw_text_len = len(raw_text)
                context_tokens = tokenizer.tokenize(raw_text)
            else:
                context_tokens = tokenizer.tokenize("EMPTY TEXT")

            if input_pos % 100 == 0:
                print_rank_0("input_pos: %d" % input_pos)

            token_stream = get_token_stream(model, [context_tokens])
            for _, decode_tokens in enumerate(token_stream):
                pass

            if mpu.get_tensor_model_parallel_rank() == 0:
                if mpu.is_pipeline_first_stage():
                    decode_tokens, _ = decode_tokens
                    decode_tokens = decode_tokens[0].cpu().numpy().tolist()
                    trim_decode_tokens = tokenizer.detokenize(
                        decode_tokens)[raw_text_len:]
                    generated_output = trim_decode_tokens.split("\n")[0]
                    generated_output = generated_output.strip()
                    fname_out.write(generated_output)
                    fname_out.write("\n")

            raw_text = None
            context_count += 1
            if input_pos == input_count:
                return
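# Illustrative sketch (not part of the original diff): how a "knowledge"-type query is
# assembled above from one tab-separated test line (control codes before the tab, turns
# after it, separated by " [SEP] "). The example line and few-shot prompt are made up.
example_line = "jazz [CTRL] music\tI love music. [SEP] Who invented jazz?"
example_splits = example_line.strip().split("\t")
example_topic = example_splits[0].split(" [CTRL] ")[0]      # "jazz"
example_last_turn = example_splits[1].split(" [SEP] ")[-1]  # "Who invented jazz?"
few_shot_prompt = "example one \nexample two \n"            # would come from --prompt-file
example_query = few_shot_prompt + "( " + example_last_turn + " ) " + example_topic + " =>"
print(example_query)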
def dialog_with_gpt_control_interactive(conv_model, ctrl_model, add_separtor):

    args = get_args()
    tokenizer = get_tokenizer()

    conv_model.eval()
    ctrl_model.eval()
    dialog_history = []
    with torch.no_grad():
        while True:
            ctrl_model_input_text_len = 0
            if mpu.is_pipeline_first_stage() \
               and mpu.get_tensor_model_parallel_rank() == 0:
                # input @@ to separate the control code and current turn
                input_text = input(">>> ")
                while not input_text:
                    print("Input should not be empty!")
                    input_text = input(">>> ")
                assert " @@ " in input_text, "Please input with a correct template"

                splits = input_text.split(" @@ ")
                ctrl_code = splits[0]
                curr_turn = splits[1]

                prev_two_turns = ""
                if add_separtor:
                    for i, turn in enumerate(dialog_history[-2:]):
                        if i == 0:
                            prev_two_turns = "<< " + turn + " >>"
                        else:
                            prev_two_turns += " "
                            prev_two_turns += "<< " + turn + " >>"
                else:
                    prev_two_turns = " ".join(dialog_history[-2:])

                dialog_history.append(curr_turn)
                print("\nHistory:", prev_two_turns)
                print("User:", curr_turn)

                if add_separtor:
                    curr_turn = "<< " + curr_turn + " >>"

                if prev_two_turns != "":
                    dialog_context = prev_two_turns + " " + curr_turn
                else:
                    dialog_context = curr_turn

                ctrl_input = ctrl_code + " " + dialog_context
                if add_separtor:
                    ctrl_input += " :"

                ctrl_input_text_len = len(ctrl_input)
                ctrl_context_tokens = tokenizer.tokenize(ctrl_input)
            else:
                ctrl_context_tokens = tokenizer.tokenize("EMPTY TEXT")

            token_stream = get_token_stream(ctrl_model, [ctrl_context_tokens])
            for _, decode_tokens in enumerate(token_stream):
                pass

            if mpu.get_tensor_model_parallel_rank() == 0:
                if mpu.is_pipeline_first_stage():
                    decode_tokens, _ = decode_tokens
                    decode_tokens = decode_tokens[0].cpu().numpy().tolist()
                    control_sent = tokenizer.detokenize(
                        decode_tokens)[ctrl_input_text_len:]
                    control_sent = control_sent.replace("<|endoftext|>", "")
                    print("\nControl Sentence:", control_sent)

                    if control_sent != "":
                        control_sent = "( " + control_sent + " )"
                        conv_input = control_sent + " " + dialog_context
                    else:
                        conv_input = dialog_context
                    conv_input_text_len = len(conv_input)
                    conv_context_tokens = tokenizer.tokenize(conv_input)

            token_stream = get_token_stream(conv_model, [conv_context_tokens])
            for _, decode_tokens in enumerate(token_stream):
                pass

            if mpu.get_tensor_model_parallel_rank() == 0:
                if mpu.is_pipeline_first_stage():
                    decode_tokens, _ = decode_tokens
                    decode_tokens = decode_tokens[0].cpu().numpy().tolist()
                    response = tokenizer.detokenize(
                        decode_tokens)[conv_input_text_len:]
                    response = response.replace("<|endoftext|>", "")
                    print("\nChatbot:", response)
                    dialog_history.append(response)
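# Illustrative sketch (not part of the original diff): the interactive loop above expects
# "control_code @@ current_user_turn" on stdin and, with add_separtor and an empty dialog
# history, builds the control-model input shown below. Example strings are hypothetical.
demo_input_text = "knowledge @@ Tell me about black holes"
demo_ctrl_code, demo_curr_turn = demo_input_text.split(" @@ ")
demo_add_separtor = True
if demo_add_separtor:
    demo_curr_turn = "<< " + demo_curr_turn + " >>"
demo_ctrl_input = demo_ctrl_code + " " + demo_curr_turn + (" :" if demo_add_separtor else "")
print(demo_ctrl_input)   # "knowledge << Tell me about black holes >> :"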
def dialog_with_dpr_control_interactive(conv_model, ctrl_model, ctrl_tokenizer,
                                        knowledge_corpus, knowledge_corpus_emb, add_separtor):

    args = get_args()
    tokenizer = get_tokenizer()

    conv_model.eval()
    ctrl_model.eval()
    dialog_history = []
    with torch.no_grad():
        while True:
            input_text = input(">>> ")
            while not input_text:
                print("Input should not be empty!")
                input_text = input(">>> ")
            assert " @@ " in input_text, "Please input with a correct template"

            splits = input_text.split(" @@ ")
            ctrl_code = splits[0]
            curr_turn = splits[1]

            prev_two_turns = " ".join(dialog_history[-2:])
            prev_two_turns_v2 = ""
            if add_separtor:
                for i, turn in enumerate(dialog_history[-2:]):
                    if i == 0:
                        prev_two_turns_v2 = "<< " + turn + " >>"
                    else:
                        prev_two_turns_v2 += " "
                        prev_two_turns_v2 += "<< " + turn + " >>"
            else:
                prev_two_turns_v2 = prev_two_turns

            dialog_history.append(curr_turn)
            print("\nHistory:", prev_two_turns_v2)
            print("\nUser:", curr_turn)

            if prev_two_turns != "":
                dialog_context = prev_two_turns + " " + curr_turn
            else:
                dialog_context = curr_turn

            if add_separtor:
                curr_turn = "<< " + curr_turn + " >>"
                dialog_context_v2 = prev_two_turns_v2 + curr_turn
            else:
                dialog_context_v2 = dialog_context

            ctrl_input = ctrl_code + " " + dialog_context
            ctrl_input_ids = ctrl_tokenizer.encode(ctrl_input)
            ctrl_input_ids = torch.LongTensor([ctrl_input_ids]).cuda()
            attn_masks = torch.ones(1, ctrl_input_ids.size()[-1]).cuda()
            query_emb = ctrl_model(input_ids=ctrl_input_ids,
                                   attention_mask=attn_masks).pooler_output  # (1,768)
            logits = knowledge_corpus_emb.matmul(query_emb[0])
            retrieved_idx = torch.argmax(logits).item()
            control_sent = knowledge_corpus[retrieved_idx].strip()
            print("\nControl Sentence:", control_sent)

            if control_sent != "":
                control_sent = "( " + control_sent + " )"
                conv_input = control_sent + " " + dialog_context_v2
            else:
                conv_input = dialog_context_v2

            conv_input_text_len = len(conv_input)
            conv_context_tokens = tokenizer.tokenize(conv_input)

            token_stream = get_token_stream(conv_model, [conv_context_tokens])
            for _, decode_tokens in enumerate(token_stream):
                pass

            if mpu.get_tensor_model_parallel_rank() == 0:
                if mpu.is_pipeline_first_stage():
                    decode_tokens, _ = decode_tokens
                    decode_tokens = decode_tokens[0].cpu().numpy().tolist()
                    response = tokenizer.detokenize(
                        decode_tokens)[conv_input_text_len:]
                    response = response.replace("<|endoftext|>", "")
                    print("\nChatbot:", response)
                    dialog_history.append(response)
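# Illustrative sketch (not part of the original diff): the DPR-style retrieval step above
# scores every knowledge sentence by a dot product between its precomputed embedding and
# the query's pooled embedding, then takes the argmax. Corpus contents, sizes, and random
# embeddings below are made up for illustration; only the scoring pattern matches the code.
import torch

demo_corpus = ["Jazz originated in New Orleans.", "Water boils at 100 C."]
demo_corpus_emb = torch.randn(len(demo_corpus), 768)   # (num_sentences, hidden)
demo_query_emb = torch.randn(1, 768)                   # stands in for pooler_output

demo_scores = demo_corpus_emb.matmul(demo_query_emb[0])    # (num_sentences,)
demo_idx = torch.argmax(demo_scores).item()
print(demo_idx, demo_corpus[demo_idx].strip())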
# We added this function to support the tasks evaluation such as squad
# and drop in the https://github.com/EleutherAI/lm-evaluation-harness
# codebase. The lm-evaluation-harness code can now call this function
@@ -72,23 +72,6 @@ def add_text_generate_args(parser):
group.add_argument("--recompute", action='store_true',
help='During generation recompute all attention '
'instead of using previously computed keys/values.')
group.add_argument('--spec-toks', type=str, default=None,
help='additional special tokens')
group.add_argument('--line-by-line', action="store_true",
help='generate samples line by line')
group.add_argument('--prompt', action="store_true",
help='generate samples based on prompting')
group.add_argument('--prompt-file', type=str, default="",
help='prompting file')
group.add_argument('--prompt-type', type=str, default="",
help='prompt type (context or keyphrase)')
group.add_argument('--num-prompt-examples', type=int, default=10,
help='number of prompt examples')
group.add_argument("--noknowledge", action='store_true', default=False,
help='Do not use knowledge in prompting')
group.add_argument('--dynamic-prompt', action='store_true', default=False,
help='using different prompts for different test samples')
return parser
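# Illustrative sketch (not part of the original diff): how the prompting flags removed in
# this hunk would parse on the command line. This builds a small stand-alone parser with
# the same option names purely for demonstration; it is not the Megatron argument setup.
import argparse

demo_parser = argparse.ArgumentParser()
demo_parser.add_argument('--prompt', action="store_true")
demo_parser.add_argument('--prompt-file', type=str, default="")
demo_parser.add_argument('--prompt-type', type=str, default="")
demo_parser.add_argument('--num-prompt-examples', type=int, default=10)
demo_parser.add_argument('--dynamic-prompt', action='store_true', default=False)

demo_args = demo_parser.parse_args(
    ['--prompt', '--prompt-file', 'prompts.jsonl', '--prompt-type', 'knowledge'])
print(demo_args.prompt, demo_args.prompt_type, demo_args.num_prompt_examples)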