Commit 453414da authored by rprenger

Removing unnecessary --recompute path

parent f7fe3865
...
@@ -189,40 +189,30 @@ def sample_sequence_batch(model, context_tokens, context_lengths,
         lengths = torch.ones([batch_size]).long().cuda() * maxlen
         while context_length <= (maxlen):
-            if args.recompute:
-                output = forward_step(model, tokens,
-                                      position_ids,
-                                      attention_mask,
-                                      tokentype_ids=type_ids,
-                                      forward_method_parallel_output=False)
-                if mpu.is_pipeline_last_stage():
-                    assert output is not None
-                    logits = output[:, context_length - 1, :]
-            else:
-                types2use = None
-                if counter == 0:
-                    tokens2use = tokens[:, :context_length]
-                    positions2use = position_ids[:, :context_length]
-                    if type_ids is not None:
-                        types2use = type_ids[:, :context_length]
-                else:
-                    tokens2use = tokens[:, context_length - 1].view(
-                        batch_size, -1)
-                    positions2use = position_ids[:, context_length - 1].view(
-                        batch_size, -1)
-                    if type_ids is not None:
-                        types2use = type_ids[:, context_length - 1].view(
-                            batch_size, -1)
-                output, layer_past = forward_step(model, tokens2use,
-                                                  positions2use,
-                                                  attention_mask,
-                                                  layer_past=layer_past,
-                                                  get_key_value=True,
-                                                  tokentype_ids=types2use,
-                                                  forward_method_parallel_output=False)
-                if mpu.is_pipeline_last_stage():
-                    assert output is not None
-                    logits = output[:, -1].view(batch_size, -1).contiguous()
+            types2use = None
+            if counter == 0:
+                tokens2use = tokens[:, :context_length]
+                positions2use = position_ids[:, :context_length]
+                if type_ids is not None:
+                    types2use = type_ids[:, :context_length]
+            else:
+                tokens2use = tokens[:, context_length - 1].view(
+                    batch_size, -1)
+                positions2use = position_ids[:, context_length - 1].view(
+                    batch_size, -1)
+                if type_ids is not None:
+                    types2use = type_ids[:, context_length - 1].view(
+                        batch_size, -1)
+            output, layer_past = forward_step(model, tokens2use,
+                                              positions2use,
+                                              attention_mask,
+                                              layer_past=layer_past,
+                                              get_key_value=True,
+                                              tokentype_ids=types2use,
+                                              forward_method_parallel_output=False)
+            if mpu.is_pipeline_last_stage():
+                assert output is not None
+                logits = output[:, -1].view(batch_size, -1).contiguous()
             if mpu.is_pipeline_last_stage():
                 if args.greedy:
...
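The retained branch is the incremental decoding path: on the first iteration the full prompt is fed through the model, and every later iteration feeds only the newest token together with the cached keys/values returned as layer_past. A minimal, self-contained sketch of that loop structure follows; toy_forward_step and the greedy token pick are illustrative stand-ins, not Megatron's real forward_step or sampling code.

```python
import torch

def toy_forward_step(tokens2use, positions2use, layer_past=None):
    # Hypothetical stand-in for Megatron's forward_step: returns random
    # per-position "logits" and an opaque cache object ("layer_past").
    batch_size, seq_len = tokens2use.shape
    vocab_size = 8
    logits = torch.randn(batch_size, seq_len, vocab_size)
    new_past = ([] if layer_past is None else layer_past) + [tokens2use]
    return logits, new_past

def incremental_decode(tokens, position_ids, context_length, maxlen):
    # Mirrors the retained loop: feed the whole prompt once (counter == 0),
    # then only the latest token plus the cached keys/values on later steps.
    batch_size = tokens.size(0)
    layer_past = None
    counter = 0
    while context_length <= maxlen:
        if counter == 0:
            tokens2use = tokens[:, :context_length]
            positions2use = position_ids[:, :context_length]
        else:
            tokens2use = tokens[:, context_length - 1].view(batch_size, -1)
            positions2use = position_ids[:, context_length - 1].view(batch_size, -1)
        output, layer_past = toy_forward_step(tokens2use, positions2use,
                                              layer_past=layer_past)
        # Only the logits at the last fed position matter for the next token.
        logits = output[:, -1].view(batch_size, -1).contiguous()
        next_token = logits.argmax(dim=-1)  # greedy pick, for the sketch only
        if context_length < tokens.size(1):
            tokens[:, context_length] = next_token
        context_length += 1
        counter += 1
    return tokens

if __name__ == "__main__":
    batch_size, total_len, prompt_len = 2, 12, 4
    tokens = torch.randint(0, 8, (batch_size, total_len))
    position_ids = torch.arange(total_len).unsqueeze(0).expand(batch_size, -1).clone()
    print(incremental_decode(tokens, position_ids, prompt_len, total_len - 1))
```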
...
@@ -55,10 +55,6 @@ def add_text_generate_args(parser):
                        help='Top k sampling.')
     group.add_argument("--out-seq-length", type=int, default=1024,
                        help='Size of the output generated text.')
-    group.add_argument("--recompute", action='store_true',
-                       help='During generation recompute all attention '
-                       'instead of using previously computed keys/values.')
     return parser
...
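After this change the argument group no longer exposes --recompute; generation always uses the cached key/value path. A minimal argparse sketch of the trimmed group is below, showing only the flag that appears in full in the diff (the group title and any other flags are assumptions).

```python
import argparse

def add_text_generate_args(parser):
    # Sketch of the trimmed argument group. Only --out-seq-length appears in
    # full in the diff; the group title and any other flags are assumptions.
    group = parser.add_argument_group(title='text generation')
    group.add_argument("--out-seq-length", type=int, default=1024,
                       help='Size of the output generated text.')
    # The --recompute flag is gone: generation always reuses cached
    # keys/values (layer_past) instead of recomputing all attention.
    return parser

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser = add_text_generate_args(parser)
    print(parser.parse_args(["--out-seq-length", "256"]))
```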