Generation: only search for eos_token if set (#22875)

Generation: only check for eos_token if set. The check on unfinished_sequences.max(), which detects sequences that have ended early via eos_token_id, creates a synchronization point even when there is no eos_token, which slows inference down. This change moves that check inside the condition that tests for eos_token, so that the slowdown can be avoided by disabling this token. Co-authored-by: John Doe <john.doe@example.com>

Generation: only search for eos_token if set (#22875)
Generation: only check for eos_token if set. The check on unfinished_sequences.max(), which detects sequences that have ended early via eos_token_id, creates a synchronization point even when there is no eos_token, which slows inference down. This change moves that check inside the condition that tests for eos_token, so that the slowdown can be avoided by disabling this token. Co-authored-by: John Doe <john.doe@example.com>
d50db469 · xloem · GitHub · a438a094 · d50db469
Unverified Commit d50db469 authored Apr 20, 2023 by xloem Committed by GitHub Apr 20, 2023
Hide whitespace changes
Inline Side-by-side

Showing with 36 additions and 20 deletions

src/transformers/generation/utils.py src/transformers/generation/utils.py +36 -20

No files found.
--- a/src/transformers/generation/utils.py
+++ b/src/transformers/generation/utils.py
@@ -2120,13 +2120,17 @@ class GenerationMixin:
                    next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0)
                )

-            # stop when each sentence is finished, or if we exceed the maximum length
-            if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores):
-                if not synced_gpus:
-                    break
-                else:
+                # stop when each sentence is finished
+                if unfinished_sequences.max() == 0:
                    this_peer_finished = True

+            # stop if we exceed the maximum length
+            if stopping_criteria(input_ids, scores):
+                this_peer_finished = True
+
+            if this_peer_finished and not synced_gpus:
+                break
+
        if streamer is not None:
            streamer.end()

@@ -2375,13 +2379,17 @@ class GenerationMixin:
                    next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0)
                )

-            # stop when each sentence is finished, or if we exceed the maximum length
-            if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores):
-                if not synced_gpus:
-                    break
-                else:
+                # stop when each sentence is finished
+                if unfinished_sequences.max() == 0:
                    this_peer_finished = True

+            # stop if we exceed the maximum length
+            if stopping_criteria(input_ids, scores):
+                this_peer_finished = True
+
+            if this_peer_finished and not synced_gpus:
+                break
+
        if streamer is not None:
            streamer.end()

@@ -2653,13 +2661,17 @@ class GenerationMixin:
                    next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0)
                )

-            # stop when each sentence is finished, or if we exceed the maximum length
-            if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores):
-                if not synced_gpus:
-                    break
-                else:
+                # stop when each sentence is finished
+                if unfinished_sequences.max() == 0:
                    this_peer_finished = True

+            # stop if we exceed the maximum length
+            if stopping_criteria(input_ids, scores):
+                this_peer_finished = True
+
+            if this_peer_finished and not synced_gpus:
+                break
+
        if streamer is not None:
            streamer.end()

@@ -4418,13 +4430,17 @@ class GenerationMixin:
                    next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0)
                )

-            # stop when each sentence is finished, or if we exceed the maximum length
-            if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores):
-                if not synced_gpus:
-                    break
-                else:
+                # stop when each sentence is finished
+                if unfinished_sequences.max() == 0:
                    this_peer_finished = True

+            # stop if we exceed the maximum length
+            if stopping_criteria(input_ids, scores):
+                this_peer_finished = True
+
+            if this_peer_finished and not synced_gpus:
+                break
+
        if streamer is not None:
            streamer.end()