Unverified Commit ff739414 authored by Hailey Schoelkopf, committed by GitHub

Fix bug in multi-token Stop Sequences (#1268)

* fix incorrect lookback protections

* bump generate_until task versions
parent 818c056b
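
For context on the "lookback" this commit touches: the stopping criterion decodes a window of recently generated tokens and checks whether the stop string appears in that decoded text, using a window two tokens longer than the encoded stop sequence. A minimal sketch of why that slack is needed, assuming the publicly available "gpt2" tokenizer as an arbitrary example (nothing in this commit pins a particular tokenizer):

# Illustrative only: the same stop string can be produced by a different token
# sequence than the one obtained by encoding it directly, so comparing token ids
# exactly would miss valid stops; comparing the decoded text over a slightly
# longer window does not.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")

stop = "\n\n"
direct_ids = tok.encode(stop, add_special_tokens=False)        # encoding the stop string directly
newline_ids = tok.encode("\n", add_special_tokens=False) * 2   # a model emitting "\n" twice instead
print(direct_ids, newline_ids)            # two different id sequences...
print(tok.decode(newline_ids) == stop)    # ...that decode to the same stop string (True)
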
@@ -16,4 +16,4 @@ metric_list:
     aggregation: !function metrics.agg_bleu
     higher_is_better: true
 metadata:
-  version: 0.0
+  version: 1.0

@@ -636,6 +636,7 @@ class MultiTokenEOSCriteria(transformers.StoppingCriteria):
         self.done_tracker = [False] * batch_size
         self.sequence = sequence
         self.sequence_ids = tokenizer.encode(sequence, add_special_tokens=False)
+        # print(sequence, self.sequence_ids)
         # we look back for 2 more tokens than it takes to encode our stop sequence
         # because tokenizers suck, and a model might generate `['\n', '\n']` but our `sequence` is `['\n\n']`
         # and we don't want to mistakenly not stop a generation because our
@@ -643,16 +644,18 @@ class MultiTokenEOSCriteria(transformers.StoppingCriteria):
         # NOTE: there is a minor danger that this will end up looking back 2 tokens into the past, into the inputs to the model,
         # and stopping generation immediately as a result. With only 2 extra tokens of lookback, this risk is minimized
+        # Additionally, in lookback_ids_batch we should prevent ever looking back into the inputs as described.
         self.sequence_id_len = len(self.sequence_ids) + 2
         self.tokenizer = tokenizer
 
     def __call__(self, input_ids, scores, **kwargs) -> bool:
         # For efficiency, we compare the last n tokens where n is the number of tokens in the stop_sequence
-        lookback_ids_batch = input_ids[:, self.initial_decoder_input_length :][
-            :, -self.sequence_id_len :
-        ]
+        lookback_ids_batch = input_ids[:, self.initial_decoder_input_length :]
+
+        lookback_ids_batch = lookback_ids_batch[:, -self.sequence_id_len :]
         lookback_tokens_batch = self.tokenizer.batch_decode(lookback_ids_batch)
         for i, done in enumerate(self.done_tracker):
             if not done:
                 self.done_tracker[i] = self.sequence in lookback_tokens_batch[i]
...
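
Outside the harness, a stopping criterion with the patched lookback logic would be attached to a Hugging Face generate() call roughly as sketched below. The model name, prompt, and the class and variable names are illustrative assumptions; lm-evaluation-harness wires this up through its own helpers rather than this exact code.

# Standalone sketch mirroring the patched MultiTokenEOSCriteria logic:
# decode a short lookback window of *generated* tokens only, and stop once
# every row in the batch contains the stop string.
import transformers


class StopOnSubstring(transformers.StoppingCriteria):
    def __init__(self, sequence, tokenizer, initial_decoder_input_length, batch_size):
        self.initial_decoder_input_length = initial_decoder_input_length
        self.done_tracker = [False] * batch_size
        self.sequence = sequence
        sequence_ids = tokenizer.encode(sequence, add_special_tokens=False)
        # +2 tokens of slack to tolerate alternative tokenizations of the stop string
        self.sequence_id_len = len(sequence_ids) + 2
        self.tokenizer = tokenizer

    def __call__(self, input_ids, scores, **kwargs) -> bool:
        # Slice away the prompt first, then keep only the last few generated tokens.
        lookback_ids = input_ids[:, self.initial_decoder_input_length :]
        lookback_ids = lookback_ids[:, -self.sequence_id_len :]
        lookback_text = self.tokenizer.batch_decode(lookback_ids)
        for i, done in enumerate(self.done_tracker):
            if not done:
                self.done_tracker[i] = self.sequence in lookback_text[i]
        return False not in self.done_tracker


tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2")
model = transformers.AutoModelForCausalLM.from_pretrained("gpt2")

prompt = "Q: What color is the sky?\nA:"
inputs = tokenizer(prompt, return_tensors="pt")
stopping = transformers.StoppingCriteriaList(
    [StopOnSubstring("\n\n", tokenizer, inputs["input_ids"].shape[1], batch_size=1)]
)
output = model.generate(**inputs, max_new_tokens=64, stopping_criteria=stopping)
print(tokenizer.decode(output[0][inputs["input_ids"].shape[1] :]))

Consistent with the comment added in this diff, the sketch strips the prompt before taking the last few tokens, so the lookback window never reaches back into the model's inputs.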