Unverified Commit 7aaceeec authored by Ameya Godbole, committed by GitHub

Fix error when collating queries with different continuation lengths (fixes #2984) (#2987)



* FIX error due to grouping queries with different continuation lengths

Make the Collator choose the query with the longest continuation as the
candidate for generation

* use max for key selection

* added comments explaining variable cont length (identical ctx+cont[:-1])

---------
Co-authored-by: Baber <baber@hey.com>
parent 357d4eaa
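
To make the failure mode concrete, here is a minimal sketch with made-up token ids (not harness code) of how two requests can land in the same context group while their continuations have different lengths, because the context/continuation split differs:

req_a = ([1, 2, 3], [4])   # (context_enc, continuation_enc): continuation length 1
req_b = ([1, 2], [3, 4])   # continuation length 2

# Both requests reduce to the same grouping key ctx + cont[:-1],
# so group_by="contexts" runs a single forward pass for the group.
key_a = tuple(req_a[0] + req_a[1][:-1])
key_b = tuple(req_b[0] + req_b[1][:-1])
assert key_a == key_b == (1, 2, 3)

Before this fix, the group's first member served as the representative for the forward pass, so a shorter continuation (req_a here) determined how many logit positions were kept and the longer member (req_b) could not be scored.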
@@ -1136,7 +1136,7 @@ class HFLM(TemplateLM):
             if self.backend == "causal":
                 total_length = len(context_enc) + len(continuation_enc)
                 if total_length > self.max_length + 1:
-                    eval_logger.warn(
+                    eval_logger.warning(
                         f"Combined length of context ({len(context_enc)}) and continuation ({len(continuation_enc)}) "
                         f"exceeds model's maximum length ({self.max_length}). "
                         f"Truncating {total_length - self.max_length + 1} tokens from the left."
@@ -1247,7 +1247,12 @@ class HFLM(TemplateLM):
                     cont_toks = torch.tensor(
                         cont_toks, dtype=torch.long, device=self.device
                     ).unsqueeze(0)  # [1, seq]
-                    max_equal = (greedy_tokens == cont_toks).all()
+                    # Use trailing slice [-cont_toks.shape[1]:] to handle variable cont_len (identical ctx+cont[:-1]),
+                    # i.e. continuations may be sliced at different points. The Collator ensures we have sufficient
+                    # greedy_tokens by choosing the key with the longest continuation when group_by="contexts".
+                    max_equal = (
+                        greedy_tokens[:, -cont_toks.shape[1] :] == cont_toks
+                    ).all()
 
                     # Obtain log-probs at the corresponding continuation token indices
                     # last_token_slice = logits[:, -1, :].squeeze(0).tolist()
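
A toy example (assumed token ids, not harness code) of the trailing-slice comparison: the group's forward pass produced greedy tokens for the longest continuation, so a shorter member is checked only against the trailing slice of matching length.

import torch

greedy_tokens = torch.tensor([[7, 8, 9]])  # [1, max_cont_len] from the group's representative
cont_toks = torch.tensor([[8, 9]])         # [1, cont_len] for a shorter group member

# Compare only the last cont_len positions, mirroring the diff above.
max_equal = (greedy_tokens[:, -cont_toks.shape[1]:] == cont_toks).all()
print(max_equal)  # tensor(True)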
@@ -428,9 +428,13 @@ class Collator:
             batch = self.get_chunks(values, n=n, fn=batch_fn)
             yield from batch
         elif self._group_by == "contexts":
-            # Get one sample from each key
+            # Get one sample from each key.
+            # Select the longest continuation per group to ensure sufficient context logits.
             values = self._reorder(
-                [value[0] for value in self._arr_with_indices.values()]
+                [
+                    max(value, key=lambda x: len(x[1][-1]))
+                    for value in self._arr_with_indices.values()
+                ]
             )
             batch = self.get_chunks(values, n=n, fn=batch_fn)
             yield from batch
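
A sketch of the new selection rule, assuming (as the lambda implies) that each group value is a list of (original_index, (cache_key, context_enc, continuation_enc)) entries: the key len(x[1][-1]) is an entry's continuation length, so max picks the member whose continuation is longest.

grouped = {
    (1, 2, 3): [  # hypothetical group key: ctx + cont[:-1]
        (0, ("key_a", [1, 2, 3], [4])),  # continuation length 1
        (1, ("key_b", [1, 2], [3, 4])),  # continuation length 2 -> chosen
    ]
}
representatives = [
    max(value, key=lambda x: len(x[1][-1]))
    for value in grouped.values()
]
print(representatives)  # [(1, ('key_b', [1, 2], [3, 4]))]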