"tests/vscode:/vscode.git/clone" did not exist on "2d3eac307e7de25429d673cbd6627e44c5931087"
Commit 3701bd08 authored by haileyschoelkopf

add hacks for triviaqa

parent d15ee17a
@@ -290,7 +290,7 @@ def evaluate(
         for metric, value in metrics.items():
             vals[(task_name, key, metric)].append(value)
 
-    if lm.world_size > 1:
+    if lm.world_size >= 1:
         # if multigpu, then gather data across all ranks
         vals_torch = collections.defaultdict(list)
         for (task_name, key, metric), items in vals.items():
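
Changing `>` to `>=` makes the gather branch run even in a single-process run, presumably so the new string-gathering hack below is always exercised. For context, a minimal sketch of the pad-then-gather pattern this branch relies on, using Hugging Face Accelerate; the `Accelerator()` setup here is illustrative, not code from this commit:

# Minimal sketch of the pad-then-gather pattern used in the hunk below.
import torch
from accelerate import Accelerator

accelerator = Accelerator()

# Each rank may hold a different number of metric values.
local_vals = torch.tensor([0.5, 1.0], device=accelerator.device)

# gather() needs identical shapes on every rank, so right-pad each
# rank's tensor to the longest length with a sentinel value first.
pad_value = torch.finfo(torch.float32).min
padded = accelerator.pad_across_processes(local_vals, pad_index=pad_value)

# Concatenate across ranks, then strip the sentinel padding back out.
gathered = accelerator.gather(padded)
gathered = gathered[gathered != pad_value]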
@@ -298,26 +298,38 @@ def evaluate(
             numitem = 0
             if type(items[0]) == tuple:
                 numitem = len(items[0])
-
+                # strings = items[1]
+                # if numitem = 2:
+                #     for i, string in enumerate(numitem[1]):
+                #         numitem[1][i] = torch.tensor(list(string.encode("ascii")))
+                #         print(string, numitem[1][i])
+            print(items)
+            if isinstance(items[0], str):
+                items = torch.distributed.all_gather_object(items, items)
+                print(items)
+            else:
+                print(items)
+                continue
+            # items = items[0]
             # distributed gather requires all ranks to have same dimensions
             # so we pad out with float32 min value
             pad_value = torch.finfo(torch.float32).min
             metrics_tensor = torch.tensor(items, device=lm.device)
 
             original_dtype = metrics_tensor.dtype  # store original dtype
             torch_device_tensor = lm.accelerator.pad_across_processes(
                 metrics_tensor.to(torch.float32), pad_index=pad_value
             )
             gathered_item = lm.accelerator.gather(torch_device_tensor)
 
             if numitem > 0:
                 gathered_filtered = gathered_item[gathered_item[:, 0] != pad_value]
             else:
                 gathered_filtered = gathered_item[gathered_item != pad_value]
 
             gathered_item = (
                 gathered_filtered.to(original_dtype).cpu().detach().numpy().tolist()
             )
             # reconvert if we were passed a tuple of values
             if numitem > 0:
                 gathered_item = [tuple(g) for g in gathered_item]
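
One caveat with the string branch above: `torch.distributed.all_gather_object` fills a pre-sized output list in place and returns None, so assigning its return value to `items` discards the gathered data. A sketch of the conventional usage, assuming an initialized process group (e.g. via `torch.distributed.init_process_group`); the helper name is hypothetical:

# Sketch of conventional all_gather_object usage; not code from this commit.
import torch.distributed as dist

def gather_strings(local_items):
    # all_gather_object fills a pre-sized list in place and returns None,
    # so the result must be read from `gathered`, not from the call itself.
    gathered = [None] * dist.get_world_size()
    dist.all_gather_object(gathered, local_items)
    # Flatten the per-rank lists into one list of strings.
    return [s for rank_items in gathered for s in rank_items]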
@@ -331,6 +343,7 @@ def evaluate(
 
     ### Aggregate results over all datapoints ###
     # aggregate results ; run bootstrap CIs
     for (task_name, key, metric), items in vals.items():
         task = task_dict[task_name]
         results[task_name][metric + "," + key] = task.aggregation()[metric](items)
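
For reference, `task.aggregation()` in lm-evaluation-harness maps each metric name to a reduction over the per-example values. A minimal illustration of that contract; the `acc`/mean pairing is an example, not taken from this commit:

# Illustrative only: the aggregation() contract the loop above relies on.
def aggregation():
    # metric name -> function reducing a list of per-example values
    return {"acc": lambda items: sum(items) / len(items)}

items = [1, 0, 1, 1]
print(aggregation()["acc"](items))  # 0.75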
...
@@ -233,7 +233,7 @@ class HFLM(LM):
 
     @property
     def max_gen_toks(self):
-        return 256
+        return 16
 
     @property
     def batch_size(self):
...
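
Dropping `max_gen_toks` from 256 to 16 caps how many tokens are generated per request; TriviaQA answers are short, so this mostly speeds up evaluation. A rough sketch of how such a cap typically feeds generation, using standard `transformers` calls rather than code from this commit:

# Sketch: bounding generation length with a max_gen_toks-style cap.
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

max_gen_toks = 16  # the new cap; was 256
inputs = tok("Q: Who wrote Hamlet?\nA:", return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=max_gen_toks, do_sample=False)
print(tok.decode(out[0][inputs["input_ids"].shape[1]:]))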