Unverified Commit 08fcf1fe authored by Baber Abbasi, committed by GitHub

set `--gen_kwargs` arg to None (#1145)

* set `--gen_kwargs` to None + add help to CLI

* add logging metavar

* fix verbosity help messages

* Reorder severity levels.
parent aa61f940
@@ -97,7 +97,7 @@ lm_eval --model hf \
--batch_size auto:4
```
Alternatively, you can use `lm-eval` instead of `lm_eval`.
The full list of supported arguments is provided [here](./docs/interface.md), and on the terminal by calling `lm_eval -h`. Alternatively, you can use `lm-eval` instead of `lm_eval`.
> [!Note]
> Just like you can provide a local path to `transformers.AutoModel`, you can also provide a local path to `lm_eval` via `--model_args pretrained=/path/to/model`
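For example, a locally saved checkpoint can be evaluated with the same flags shown above; a minimal sketch, where the local path and task names are placeholders:

```bash
# Evaluate a local checkpoint with the HF backend.
# /path/to/model and task1,task2 are placeholders -- substitute your own.
lm_eval --model hf \
    --model_args pretrained=/path/to/model,dtype=float32 \
    --tasks task1,task2 \
    --batch_size auto:4
```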
@@ -140,7 +140,7 @@ lm_eval --model vllm \
```
For a full list of supported vLLM configurations, please reference our vLLM integration and the vLLM documentation.
vLLM occasionally differs in output from Huggingface. We treat Huggingface as the reference implementation, and provide a script at [./scripts/model_comparator.py](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/scripts/model_comparator.py) for checking validity of vllm results against HF.
vLLM occasionally differs in output from Huggingface. We treat Huggingface as the reference implementation, and provide a [script](./scripts/model_comparator.py) for checking the validity of vLLM results against HF.
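One rough way to check this yourself is to run the same task under both backends and compare the saved results; a sketch only, where the model, task, and output paths are placeholders and the comparator script's own flags are documented in the script rather than here:

```bash
# Run an identical evaluation under the HF and vLLM backends,
# saving results to separate files for comparison.
lm_eval --model hf \
    --model_args pretrained=EleutherAI/pythia-160m \
    --tasks task1 \
    --output_path results/hf/results.json

lm_eval --model vllm \
    --model_args pretrained=EleutherAI/pythia-160m \
    --tasks task1 \
    --output_path results/vllm/results.json
```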
### Model APIs and Inference Servers
@@ -29,43 +29,53 @@ def parse_eval_args() -> argparse.Namespace:
parser.add_argument(
"--tasks",
default=None,
metavar="task1,task2",
help="To get full list of tasks, use the command lm-eval --tasks list",
)
parser.add_argument(
"--model_args",
default="",
help="String arguments for model, e.g. `pretrained=EleutherAI/pythia-160m,dtype=float32`",
help="Comma separated string arguments for model, e.g. `pretrained=EleutherAI/pythia-160m,dtype=float32`",
)
parser.add_argument(
"--num_fewshot",
type=int,
default=None,
metavar="N",
help="Number of examples in few-shot context",
)
parser.add_argument("--batch_size", type=str, default=1)
parser.add_argument(
"--batch_size",
type=str,
default=1,
metavar="auto|auto:N|N",
help="Acceptable values are 'auto', 'auto:N' or N, where N is an integer. Default 1.",
)
parser.add_argument(
"--max_batch_size",
type=int,
default=None,
help="Maximal batch size to try with --batch_size auto",
metavar="N",
help="Maximal batch size to try with --batch_size auto.",
)
parser.add_argument(
"--device",
type=str,
default=None,
help="Device to use (e.g. cuda, cuda:0, cpu)",
help="Device to use (e.g. cuda, cuda:0, cpu).",
)
parser.add_argument(
"--output_path",
default=None,
type=str,
metavar="= [dir/file.jsonl] [DIR]",
metavar="DIR|DIR/file.json",
help="The path to the output file where the result metrics will be saved. If the path is a directory and log_samples is true, the results will be saved in the directory. Else the parent directory will be used.",
)
parser.add_argument(
"--limit",
type=float,
default=None,
metavar="N|0<N<1",
help="Limit the number of examples per task. "
"If <1, limit is a percentage of the total number of examples.",
)
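As a sketch of how the arguments above combine on the command line (model and task names are placeholders): `--limit` accepts either an integer count or a fraction below 1, and `--max_batch_size` caps the search performed by `--batch_size auto`:

```bash
# Quick smoke test: 10% of each task's examples, 5-shot,
# automatic batch-size search capped at 64. Names are placeholders.
lm_eval --model hf \
    --model_args pretrained=EleutherAI/pythia-160m \
    --tasks task1,task2 \
    --num_fewshot 5 \
    --limit 0.1 \
    --batch_size auto \
    --max_batch_size 64
```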
@@ -73,25 +83,26 @@ def parse_eval_args() -> argparse.Namespace:
"--use_cache",
type=str,
default=None,
metavar="DIR",
help="A path to a sqlite db file for caching model responses. `None` if not caching.",
)
parser.add_argument("--decontamination_ngrams_path", default=None) # TODO: not used
parser.add_argument(
"--check_integrity",
action="store_true",
help="Whether to run the relevant part of the test suite for the tasks",
help="Whether to run the relevant part of the test suite for the tasks.",
)
parser.add_argument(
"--write_out",
action="store_true",
default=False,
help="Prints the prompt for the first few documents",
help="Prints the prompt for the first few documents.",
)
parser.add_argument(
"--log_samples",
action="store_true",
default=False,
help="If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis",
help="If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis. Use with --output_path.",
)
parser.add_argument(
"--show_config",
@@ -103,21 +114,23 @@ def parse_eval_args() -> argparse.Namespace:
"--include_path",
type=str,
default=None,
metavar="DIR",
help="Additional path to include if there are external tasks to include.",
)
parser.add_argument(
"--gen_kwargs",
default="",
default=None,
help=(
"String arguments for model generation on greedy_until tasks,"
" e.g. `temperature=0,top_k=0,top_p=0`"
" e.g. `temperature=0,top_k=0,top_p=0`."
),
)
parser.add_argument(
"--verbosity",
type=str,
default="INFO",
help="Log error when tasks are not registered.",
metavar="CRITICAL|ERROR|WARNING|INFO|DEBUG",
help="Controls the reported logging error level. Set to DEBUG when testing + adding new task configurations for comprehensive log output.",
)
return parser.parse_args()
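Putting the updated flags together, an invocation might look like the sketch below (model, task, and output path are placeholders). `--log_samples` is meant to be paired with `--output_path`, and with `--gen_kwargs` now defaulting to None, generation settings are only overridden when the flag is passed explicitly:

```bash
# Example invocation exercising the flags whose defaults and help text
# this change touches. Names and paths are placeholders.
lm_eval --model hf \
    --model_args pretrained=EleutherAI/pythia-160m,dtype=float32 \
    --tasks task1 \
    --gen_kwargs temperature=0,top_k=0,top_p=0 \
    --batch_size auto:4 \
    --output_path results/ \
    --log_samples \
    --verbosity DEBUG
```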
@@ -179,7 +192,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
f"{utils.SPACING}Try `lm-eval --tasks list` for list of available tasks",
)
raise ValueError(
f"Tasks {missing} were not found. Try `lm-eval --tasks list` for list of available tasks."
f"Tasks not found: {missing}. Try `lm-eval --tasks list` for list of available tasks, or '--verbosity DEBUG' to troubleshoot task registration issues."
)
if args.output_path: