Unverified Commit 178fa84d authored by Niccolò Ajroldi, committed by GitHub

Output path fix (#2993)



* fix(output_path): support direct JSON file paths

* fix linting

* turn off external LM tests for now

* Update help text for `output_path`

---------
Co-authored-by: Baber <baber@hey.com>
parent 8be417a8
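In short: `--output_path` may now point either at a directory (previous behavior) or directly at a `.json` file. Below is a minimal standalone sketch of the resolution rule introduced in the diff, using only the standard library; the function name and example values are ours, not part of the commit:

```python
from datetime import datetime
from pathlib import Path


def resolve_results_file(output_path: str, model_name_sanitized: str) -> Path:
    """Sketch of the path logic from this commit: a .json path is used
    directly (timestamp appended to its stem); anything else is treated
    as a directory that gets a per-model subdirectory."""
    path = Path(output_path)
    date_id = datetime.now().isoformat().replace(":", "-")
    if path.suffix == ".json":
        # e.g. results/run1.json -> results/run1_<timestamp>.json
        path.parent.mkdir(parents=True, exist_ok=True)
        return path.with_name(f"{path.stem}_{date_id}.json")
    # e.g. results/ -> results/<model_name_sanitized>/results_<timestamp>.json
    path = path.joinpath(model_name_sanitized)
    path.mkdir(parents=True, exist_ok=True)
    return path.joinpath(f"results_{date_id}.json")
```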
@@ -79,36 +79,36 @@ jobs:
           path: |
             test_logs/*
-  testmodels:
-    name: External LM Tests
-    runs-on: ubuntu-latest
-    timeout-minutes: 30
-    steps:
-      - name: Checkout Code
-        uses: actions/checkout@v4
-      - name: Set up Python 3.9
-        uses: actions/setup-python@v5
-        with:
-          python-version: 3.9
-          cache: pip
-          cache-dependency-path: pyproject.toml
-
-      # Cache HuggingFace cache directory for External LM tests
-      - name: Cache HuggingFace cache (External LM tests)
-        uses: actions/cache@v3
-        id: cache-hf-lm
-        with:
-          path: ~/.cache/huggingface
-          key: ${{ runner.os }}-hf-cache-external-lm
-          restore-keys: |
-            ${{ runner.os }}-hf-cache-external-lm
-
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install -e '.[dev,optimum,deepsparse,sparseml,api]' --extra-index-url https://download.pytorch.org/whl/cpu
-          pip install -U transformers peft accelerate
-
-      - name: Test with pytest
-        run: python -m pytest tests/models --showlocals -s -vv
-        continue-on-error: true # Continue workflow even if tests fail
+  # testmodels:
+  #   name: External LM Tests
+  #   runs-on: ubuntu-latest
+  #   timeout-minutes: 30
+  #   steps:
+  #     - name: Checkout Code
+  #       uses: actions/checkout@v4
+  #     - name: Set up Python 3.9
+  #       uses: actions/setup-python@v5
+  #       with:
+  #         python-version: 3.9
+  #         cache: pip
+  #         cache-dependency-path: pyproject.toml
+  #
+  #     # Cache HuggingFace cache directory for External LM tests
+  #     - name: Cache HuggingFace cache (External LM tests)
+  #       uses: actions/cache@v3
+  #       id: cache-hf-lm
+  #       with:
+  #         path: ~/.cache/huggingface
+  #         key: ${{ runner.os }}-hf-cache-external-lm
+  #         restore-keys: |
+  #           ${{ runner.os }}-hf-cache-external-lm
+  #
+  #     - name: Install dependencies
+  #       run: |
+  #         python -m pip install --upgrade pip
+  #         pip install -e '.[dev,optimum,deepsparse,sparseml,api]' --extra-index-url https://download.pytorch.org/whl/cpu
+  #         pip install -U transformers peft accelerate
+  #
+  #     - name: Test with pytest
+  #       run: python -m pytest tests/models --showlocals -s -vv
+  #       continue-on-error: true # Continue workflow even if tests fail
@@ -135,7 +135,7 @@ def setup_parser() -> argparse.ArgumentParser:
         default=None,
         type=str,
         metavar="DIR|DIR/file.json",
-        help="The path to the output file where the result metrics will be saved. If the path is a directory and log_samples is true, the results will be saved in the directory. Else the parent directory will be used.",
+        help="Path where result metrics will be saved. Can be either a directory or a .json file. If the path is a directory and log_samples is true, the results will be saved in the directory. Else the parent directory will be used.",
     )
     parser.add_argument(
         "--limit",
...
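To make the updated help text concrete, both invocation styles below should be accepted after this change; the model and task flags are illustrative placeholders, not taken from this diff:

```python
# Illustrative CLI usage (flags other than --output_path are assumptions):
#
#   lm_eval --model hf --tasks hellaswag --output_path results/
#     -> results/<model_name_sanitized>/results_<timestamp>.json
#
#   lm_eval --model hf --tasks hellaswag --output_path results/run1.json
#     -> results/run1_<timestamp>.json (samples, if logged, go to results/)
```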
@@ -229,11 +229,21 @@ class EvaluationTracker:
             )
             path = Path(self.output_path if self.output_path else Path.cwd())
-            path = path.joinpath(self.general_config_tracker.model_name_sanitized)
-            path.mkdir(parents=True, exist_ok=True)
             self.date_id = datetime.now().isoformat().replace(":", "-")
-            file_results_aggregated = path.joinpath(f"results_{self.date_id}.json")
+            if path.suffix == ".json":
+                path.parent.mkdir(parents=True, exist_ok=True)
+                file_results_aggregated = path.with_name(
+                    f"{path.stem}_{self.date_id}.json"
+                )
+            else:
+                path = path.joinpath(
+                    self.general_config_tracker.model_name_sanitized
+                )
+                path.mkdir(parents=True, exist_ok=True)
+                file_results_aggregated = path.joinpath(
+                    f"results_{self.date_id}.json"
+                )
             file_results_aggregated.open("w", encoding="utf-8").write(dumped)
             if self.api and self.push_results_to_hub:
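For readers less familiar with `pathlib`, a quick demonstration of what the new `.json` branch computes; this is plain standard-library behavior, not project code:

```python
from pathlib import Path

p = Path("out/run.json")
print(p.suffix)  # '.json' -> triggers the direct-file branch
print(p.stem)    # 'run'
print(p.with_name("run_2025-01-01T00-00-00.json"))
# out/run_2025-01-01T00-00-00.json -- same directory, new filename
```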
@@ -250,12 +260,10 @@ class EvaluationTracker:
                 )
                 self.api.upload_file(
                     repo_id=repo_id,
-                    path_or_fileobj=str(
-                        path.joinpath(f"results_{self.date_id}.json")
-                    ),
+                    path_or_fileobj=str(file_results_aggregated),
                     path_in_repo=os.path.join(
                         self.general_config_tracker.model_name,
-                        f"results_{self.date_id}.json",
+                        file_results_aggregated.name,
                     ),
                     repo_type="dataset",
                     commit_message=f"Adding aggregated results for {self.general_config_tracker.model_name}",
@@ -290,7 +298,12 @@ class EvaluationTracker:
             eval_logger.info(f"Saving per-sample results for: {task_name}")
             path = Path(self.output_path if self.output_path else Path.cwd())
-            path = path.joinpath(self.general_config_tracker.model_name_sanitized)
+            if path.suffix == ".json":
+                path = path.parent
+            else:
+                path = path.joinpath(
+                    self.general_config_tracker.model_name_sanitized
+                )
             path.mkdir(parents=True, exist_ok=True)
             file_results_samples = path.joinpath(
...
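The same rule applies to per-sample files: when `--output_path` is a `.json` file, samples are written next to it rather than into a model-named subdirectory. A one-line check of that fallback, using only `pathlib` (the example path and sanitized name are ours):

```python
from pathlib import Path

out = Path("results/run1.json")
samples_dir = out.parent if out.suffix == ".json" else out / "model_name_sanitized"
print(samples_dir)  # results -- per-sample files land beside the aggregated file
```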