add image hashing and `LMEVAL_HASHMM` envar (#2973)

* add image hashing * remove unused params description * use `LMEVAL_HASHMM` (default '1' hashes images; set to '0' to save raw images) --------- Co-authored-by: Baber <baber@hey.com>

add image hashing and `LMEVAL_HASHMM` envar (#2973)
* add image hashing * remove unused params description * use `LMEVAL_HASHMM` (default '1' hashes images; set to '0' to save raw images) --------- Co-authored-by: Baber <baber@hey.com>
e69ca5ed · achervyakov · GitHub · 0e96cd18 · e69ca5ed · e69ca5ed
Unverified Commit e69ca5ed authored Jul 05, 2025 by achervyakov Committed by GitHub Jul 05, 2025
Show whitespace changes
Inline Side-by-side

Showing with 59 additions and 1 deletion

lm_eval/evaluator.py lm_eval/evaluator.py +8 -1

lm_eval/utils.py lm_eval/utils.py +51 -0

No files found.
--- a/lm_eval/evaluator.py
+++ b/lm_eval/evaluator.py
 import itertools
 import json
 import logging
+import os
 import random
 import time
 from collections import defaultdict
@@ -29,6 +30,7 @@ from lm_eval.loggers.utils import add_env_info, add_tokenizer_info, get_git_comm
 from lm_eval.tasks import TaskManager, get_task_dict
 from lm_eval.utils import (
    handle_non_serializable,
+    hash_dict_images,
    hash_string,
    positional_deprecated,
    setup_logging,
@@ -140,7 +142,6 @@ def simple_evaluate(
        Random seed for fewshot sampler random generator. If set to None, the seed of generator will be set to None.
    :param metadata: dict
        Additional metadata to be added to the task manager. Will get passed to the download function of the task.
    return
        Dictionary of results
    """
@@ -747,6 +748,12 @@ def evaluate(
            },
        }
        if log_samples:
+            # default: hash images
+            samples = (
+                hash_dict_images(samples)
+                if os.environ.get("LMEVAL_HASHMM", "1") != "0"
+                else samples
+            )
            results_dict["samples"] = dict(samples)
        return results_dict

--- a/lm_eval/utils.py
+++ b/lm_eval/utils.py
@@ -550,3 +550,54 @@ def weighted_f1_score(items):
    preds = unzipped_list[1]
    fscore = f1_score(golds, preds, average="weighted")
    return fscore
def convert_pil_to_hash(value):
    """Return the SHA-256 hex digest of an image's PNG encoding.

    Parameters:
        value: An object exposing ``save(fp, format=...)``; callers pass a
            ``PIL.Image.Image``.

    Returns:
        str: Hexadecimal SHA-256 digest of the PNG-encoded image bytes.
    """
    from io import BytesIO

    png_buffer = BytesIO()
    value.save(png_buffer, format="PNG")
    # Hash the encoded bytes themselves. ``str(buffer)`` would only yield the
    # buffer object's repr (which embeds a memory address), so the digest
    # would not depend on the image content at all.
    return hashlib.sha256(png_buffer.getvalue()).hexdigest()
def convert_bytes_to_hash(value):
    """Return the SHA-256 hex digest of a bytes-like value.

    Parameters:
        value: ``bytes``, ``bytearray`` or ``memoryview`` to hash. Any other
            object is hashed via the UTF-8 encoding of ``str(value)``.

    Returns:
        str: Hexadecimal SHA-256 digest.
    """
    if isinstance(value, (bytes, bytearray, memoryview)):
        # Feed the raw buffer to the hash so equal content gives equal digests
        # regardless of the container type (the repr of ``bytes`` and
        # ``bytearray`` differ) and no escaped text copy has to be built.
        payload = value
    else:
        payload = str(value).encode()
    return hashlib.sha256(payload).hexdigest()
def hash_dict_images(data_dict):
    """
    Build a copy of `data_dict` in which every bytes/bytearray and
    PIL.Image.Image value is replaced by its hash.

    Nested dicts, lists and tuples are rebuilt recursively (tuples stay tuples);
    every other value is placed in the result unchanged, as the same object.
    Pillow is optional: if it cannot be imported, only bytes and bytearray
    values are hashed.

    Parameters:
        data_dict (dict): The input dictionary with arbitrary nesting of dicts,
            lists and tuples.
    Returns:
        dict: A new dictionary with the same structure as `data_dict`, but with all
              bytes and PIL.Image.Image objects replaced by their hashes.
    Raises:
        TypeError: If `data_dict` is not a dictionary.
    """
    # Ensure the top-level is a dict before doing any other work
    if not isinstance(data_dict, dict):
        raise TypeError("Input must be a dictionary")

    try:
        from PIL import Image
    except ImportError:
        # Without Pillow installed no value can be a PIL image, so the image
        # check is skipped instead of failing text-only runs.
        pil_image_type = None
    else:
        pil_image_type = Image.Image

    def _process_value(value):
        # Bytes -> hash
        if isinstance(value, (bytes, bytearray)):
            return convert_bytes_to_hash(value)
        # PIL Image -> hash
        if pil_image_type is not None and isinstance(value, pil_image_type):
            return convert_pil_to_hash(value)
        # Nested dictionary -> recurse
        if isinstance(value, dict):
            return {k: _process_value(v) for k, v in value.items()}
        # List or tuple -> recurse, preserving type
        if isinstance(value, list):
            return [_process_value(v) for v in value]
        if isinstance(value, tuple):
            return tuple(_process_value(v) for v in value)
        # Other types remain unchanged
        return value

    return {key: _process_value(val) for key, val in data_dict.items()}