Commit 3701bd08 authored by haileyschoelkopf

add hacks for triviaqa

parent d15ee17a
@@ -290,7 +290,7 @@ def evaluate(
             for metric, value in metrics.items():
                 vals[(task_name, key, metric)].append(value)

-    if lm.world_size > 1:
+    if lm.world_size >= 1:
         # if multigpu, then gather data across all ranks
         vals_torch = collections.defaultdict(list)
         for (task_name, key, metric), items in vals.items():
@@ -298,7 +298,19 @@ def evaluate(
             numitem = 0
             if type(items[0]) == tuple:
                 numitem = len(items[0])
+                # strings = items[1]
+                # if numitem = 2:
+                #     for i, string in enumerate(numitem[1]):
+                #         numitem[1][i] = torch.tensor(list(string.encode("ascii")))
+                #         print(string, numitem[1][i])
+            print(items)
+            if isinstance(items[0], str):
+                items = torch.distributed.all_gather_object(items, items)
+                print(items)
+            else:
+                print(items)
+                continue
+            # items = items[0]
             # distributed gather requires all ranks to have same dimensions
             # so we pad out with float32 min value
             pad_value = torch.finfo(torch.float32).min
@@ -331,6 +343,7 @@ def evaluate(
     ### Aggregate results over all datapoints ###
     # aggregate results ; run bootstrap CIs
     for (task_name, key, metric), items in vals.items():
         task = task_dict[task_name]
         results[task_name][metric + "," + key] = task.aggregation()[metric](items)
...
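Note on the gather hack in the hunk above: the active branch calls `torch.distributed.all_gather_object(items, items)`, passing the local list as both the output buffer and the object to gather. The PyTorch API instead expects a pre-sized output list with one slot per rank, which it fills in place and returns `None`. For reference only, a minimal sketch of that usual pattern (not part of this commit; the helper name is made up):

```python
import torch.distributed as dist

def gather_string_items(local_items):
    """Hypothetical helper: gather lists of arbitrary Python objects
    (e.g. TriviaQA answer strings) from every rank onto every rank."""
    world_size = dist.get_world_size()
    # all_gather_object fills a pre-allocated list with one entry per rank
    gathered = [None] * world_size
    dist.all_gather_object(gathered, local_items)
    # flatten the per-rank lists back into a single list of items
    return [item for rank_items in gathered for item in rank_items]
```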
@@ -233,7 +233,7 @@ class HFLM(LM):
     @property
     def max_gen_toks(self):
-        return 256
+        return 16

     @property
     def batch_size(self):
...
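The second hunk drops `max_gen_toks` from 256 to 16, capping how many tokens the HF model generates per request. A rough, self-contained sketch of how such a cap is typically applied with `transformers` generation (assumed usage for illustration; the actual call site inside `HFLM` is not shown in this diff):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

MAX_GEN_TOKS = 16  # value after this commit, down from 256

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("Q: Who wrote Hamlet?\nA:", return_tensors="pt")
output_ids = model.generate(
    **inputs,
    max_new_tokens=MAX_GEN_TOKS,  # stop after at most 16 generated tokens
    do_sample=False,              # greedy decoding
)
print(tokenizer.decode(output_ids[0][inputs["input_ids"].shape[1]:]))
```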