Unverified Commit 08fcf1fe authored by Baber Abbasi, committed by GitHub

set `--gen_kwargs` arg to None (#1145)

* set `--gen_kwargs` to None + add help to CLI

* add logging metavar

* fix verbosity help messages

* Reorder severity levels.
parent aa61f940
@@ -97,7 +97,7 @@ lm_eval --model hf \
--batch_size auto:4
```
Alternatively, you can use `lm-eval` instead of `lm_eval`.
The full list of supported arguments is provided [here](./docs/interface.md), and on the terminal by calling `lm_eval -h`. Alternatively, you can use `lm-eval` instead of `lm_eval`.
> [!Note]
> Just like you can provide a local path to `transformers.AutoModel`, you can also provide a local path to `lm_eval` via `--model_args pretrained=/path/to/model`
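For example, a locally saved checkpoint can be evaluated with the same flags shown above; a minimal sketch, where the local path and task names are placeholders:

```bash
# Evaluate a local checkpoint with the HF backend.
# /path/to/model and task1,task2 are placeholders -- substitute your own.
lm_eval --model hf \
    --model_args pretrained=/path/to/model,dtype=float32 \
    --tasks task1,task2 \
    --batch_size auto:4
```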
@@ -140,7 +140,7 @@ lm_eval --model vllm \
```
For a full list of supported vLLM configurations, please reference our vLLM integration and the vLLM documentation.
vLLM occasionally differs in output from Huggingface. We treat Huggingface as the reference implementation, and provide a script at [./scripts/model_comparator.py](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/scripts/model_comparator.py) for checking validity of vllm results against HF.
vLLM occasionally differs in output from Huggingface. We treat Huggingface as the reference implementation, and provide a [script](./scripts/model_comparator.py) for checking the validity of vLLM results against HF.
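One rough way to check this yourself is to run the same task under both backends and compare the saved results; a sketch only, where the model, task, and output paths are placeholders and the comparator script's own flags are documented in the script rather than here:

```bash
# Run an identical evaluation under the HF and vLLM backends,
# saving results to separate files for comparison.
lm_eval --model hf \
    --model_args pretrained=EleutherAI/pythia-160m \
    --tasks task1 \
    --output_path results/hf/results.json

lm_eval --model vllm \
    --model_args pretrained=EleutherAI/pythia-160m \
    --tasks task1 \
    --output_path results/vllm/results.json
```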
### Model APIs and Inference Servers
@@ -29,43 +29,53 @@ def parse_eval_args() -> argparse.Namespace:
parser.add_argument(
"--tasks",
default=None,
metavar="task1,task2",
help="To get full list of tasks, use the command lm-eval --tasks list",
)
parser.add_argument(
"--model_args",
default="",
help="String arguments for model, e.g. `pretrained=EleutherAI/pythia-160m,dtype=float32`",
help="Comma separated string arguments for model, e.g. `pretrained=EleutherAI/pythia-160m,dtype=float32`",
)
parser.add_argument(
"--num_fewshot",
type=int,
default=None,
metavar="N",
help="Number of examples in few-shot context",
)
parser.add_argument("--batch_size", type=str, default=1)
parser.add_argument(
"--batch_size",
type=str,
default=1,
metavar="auto|auto:N|N",
help="Acceptable values are 'auto', 'auto:N' or N, where N is an integer. Default 1.",
)
parser.add_argument(
"--max_batch_size",
type=int,
default=None,
help="Maximal batch size to try with --batch_size auto",
metavar="N",
help="Maximal batch size to try with --batch_size auto.",
)
parser.add_argument(
"--device",
type=str,
default=None,
help="Device to use (e.g. cuda, cuda:0, cpu)",
help="Device to use (e.g. cuda, cuda:0, cpu).",
)
parser.add_argument(
"--output_path",
default=None,
type=str,
metavar="= [dir/file.jsonl] [DIR]",
metavar="DIR|DIR/file.json",
help="The path to the output file where the result metrics will be saved. If the path is a directory and log_samples is true, the results will be saved in the directory. Else the parent directory will be used.",
)
parser.add_argument(
"--limit",
type=float,
default=None,
metavar="N|0<N<1",
help="Limit the number of examples per task. "
"If <1, limit is a percentage of the total number of examples.",
)
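As a sketch of how the arguments above combine on the command line (model and task names are placeholders): `--limit` accepts either an integer count or a fraction below 1, and `--max_batch_size` caps the search performed by `--batch_size auto`:

```bash
# Quick smoke test: 10% of each task's examples, 5-shot,
# automatic batch-size search capped at 64. Names are placeholders.
lm_eval --model hf \
    --model_args pretrained=EleutherAI/pythia-160m \
    --tasks task1,task2 \
    --num_fewshot 5 \
    --limit 0.1 \
    --batch_size auto \
    --max_batch_size 64
```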
@@ -73,25 +83,26 @@ def parse_eval_args() -> argparse.Namespace:
"--use_cache",
type=str,
default=None,
metavar="DIR",
help="A path to a sqlite db file for caching model responses. `None` if not caching.",
)
parser.add_argument("--decontamination_ngrams_path", default=None) # TODO: not used
parser.add_argument(
"--check_integrity",
action="store_true",
help="Whether to run the relevant part of the test suite for the tasks",
help="Whether to run the relevant part of the test suite for the tasks.",
)
parser.add_argument(
"--write_out",
action="store_true",
default=False,
help="Prints the prompt for the first few documents",
help="Prints the prompt for the first few documents.",
)
parser.add_argument(
"--log_samples",
action="store_true",
default=False,
help="If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis",
help="If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis. Use with --output_path.",
)
parser.add_argument(
"--show_config",
@@ -103,21 +114,23 @@ def parse_eval_args() -> argparse.Namespace:
"--include_path",
type=str,
default=None,
metavar="DIR",
help="Additional path to include if there are external tasks to include.",
)
parser.add_argument(
"--gen_kwargs",
default="",
default=None,
help=(
"String arguments for model generation on greedy_until tasks,"
" e.g. `temperature=0,top_k=0,top_p=0`"
" e.g. `temperature=0,top_k=0,top_p=0`."
),
)
parser.add_argument(
"--verbosity",
type=str,
default="INFO",
help="Log error when tasks are not registered.",
metavar="CRITICAL|ERROR|WARNING|INFO|DEBUG",
help="Controls the reported logging error level. Set to DEBUG when testing + adding new task configurations for comprehensive log output.",
)
return parser.parse_args()
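Putting the updated flags together, an invocation might look like the sketch below (model, task, and output path are placeholders). `--log_samples` is meant to be paired with `--output_path`, and with `--gen_kwargs` now defaulting to None, generation settings are only overridden when the flag is passed explicitly:

```bash
# Example invocation exercising the flags whose defaults and help text
# this change touches. Names and paths are placeholders.
lm_eval --model hf \
    --model_args pretrained=EleutherAI/pythia-160m,dtype=float32 \
    --tasks task1 \
    --gen_kwargs temperature=0,top_k=0,top_p=0 \
    --batch_size auto:4 \
    --output_path results/ \
    --log_samples \
    --verbosity DEBUG
```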
@@ -179,7 +192,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
f"{utils.SPACING}Try `lm-eval --tasks list` for list of available tasks",
)
raise ValueError(
f"Tasks {missing} were not found. Try `lm-eval --tasks list` for list of available tasks."
f"Tasks not found: {missing}. Try `lm-eval --tasks list` for list of available tasks, or '--verbosity DEBUG' to troubleshoot task registration issues."
)
if args.output_path: