Unverified Commit ce9ba47e authored by Kiersten Stokes, committed by GitHub

Clean up README and pyproject.toml (#2814)

parent 7123c6a5
@@ -326,7 +326,7 @@ lm_eval --model local-completions --tasks gsm8k --model_args model=facebook/opt-
Note that for externally hosted models, configs such as `--device`, which relate to where to place a local model, should not be used and do not function. Just as you can use `--model_args` to pass arbitrary arguments to the model constructor for local models, you can use it to pass arbitrary arguments to the model API for hosted models. See the documentation of the hosting service for information on which arguments they support.
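For example, extra `--model_args` keys are forwarded to the hosted API. A minimal sketch, assuming an OpenAI-compatible server is already running at the address shown (the endpoint URL is hypothetical; `base_url` and `num_concurrent` are illustrative keys, not defaults):

```bash
# Evaluate against a self-hosted, OpenAI-compatible completions endpoint.
# Keys besides model= are passed through to the API client.
lm_eval --model local-completions \
  --tasks gsm8k \
  --model_args model=facebook/opt-125m,base_url=http://localhost:8000/v1/completions,num_concurrent=1
```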
| API or Inference Server | Implemented? | `--model <xxx>` name | Models supported: | Request Types: |
|-------------------------|--------------|----------------------|-------------------|----------------|
| OpenAI Completions | :heavy_check_mark: | `openai-completions`, `local-completions` | All OpenAI Completions API models | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| OpenAI ChatCompletions | :heavy_check_mark: | `openai-chat-completions`, `local-chat-completions` | [All ChatCompletions API models](https://platform.openai.com/docs/guides/gpt) | `generate_until` (no logprobs) |
| Anthropic | :heavy_check_mark: | `anthropic` | [Supported Anthropic Engines](https://docs.anthropic.com/claude/reference/selecting-a-model) | `generate_until` (no logprobs) |
@@ -336,13 +336,14 @@ Note that for externally hosted models, configs such as `--device` which relate
| [Llama.cpp](https://github.com/ggerganov/llama.cpp) (via [llama-cpp-python](https://github.com/abetlen/llama-cpp-python)) | :heavy_check_mark: | `gguf`, `ggml` | [All models supported by llama.cpp](https://github.com/ggerganov/llama.cpp) | `generate_until`, `loglikelihood` (perplexity evaluation not yet implemented) |
| vLLM | :heavy_check_mark: | `vllm` | [Most HF Causal Language Models](https://docs.vllm.ai/en/latest/models/supported_models.html) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| Mamba | :heavy_check_mark: | `mamba_ssm` | [Mamba architecture Language Models via the `mamba_ssm` package](https://huggingface.co/state-spaces) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| Huggingface Optimum (Causal LMs) | :heavy_check_mark: | `openvino` | Any decoder-only AutoModelForCausalLM converted with Huggingface Optimum into OpenVINO™ Intermediate Representation (IR) format | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| Huggingface Optimum-intel IPEX (Causal LMs) | :heavy_check_mark: | `ipex` | Any decoder-only AutoModelForCausalLM | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| Neuron via AWS Inf2 (Causal LMs) | :heavy_check_mark: | `neuronx` | Any decoder-only AutoModelForCausalLM supported to run on [huggingface-ami image for inferentia2](https://aws.amazon.com/marketplace/pp/prodview-gr3e6yiscria2) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| [Neural Magic DeepSparse](https://github.com/neuralmagic/deepsparse) | :heavy_check_mark: | `deepsparse` | Any LM from [SparseZoo](https://sparsezoo.neuralmagic.com/) or on [HF Hub with the "deepsparse" tag](https://huggingface.co/models?other=deepsparse) | `generate_until`, `loglikelihood` |
| [Neural Magic SparseML](https://github.com/neuralmagic/sparseml) | :heavy_check_mark: | `sparseml` | Any decoder-only AutoModelForCausalLM from [SparseZoo](https://sparsezoo.neuralmagic.com/) or on [HF Hub](https://huggingface.co/neuralmagic). Especially useful for models with quantization like [`zoo:llama2-7b-gsm8k_llama2_pretrain-pruned60_quantized`](https://sparsezoo.neuralmagic.com/models/llama2-7b-gsm8k_llama2_pretrain-pruned60_quantized) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| NVIDIA NeMo | :heavy_check_mark: | `nemo_lm` | [All supported models](https://docs.nvidia.com/nemo-framework/user-guide/24.09/nemotoolkit/core/core.html#nemo-models) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| Watsonx.ai | :heavy_check_mark: | `watsonx_llm` | [Supported Watsonx.ai Engines](https://dataplatform.cloud.ibm.com/docs/content/wsj/analyze-data/fm-models.html?context=wx) | `generate_until`, `loglikelihood` |
| [Your local inference server!](docs/API_guide.md) | :heavy_check_mark: | `local-completions` or `local-chat-completions` | Support for OpenAI API-compatible servers, with easy customization for other APIs. | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
Models which do not supply logits or logprobs can be used with tasks of type `generate_until` only, while local models, or APIs that supply logprobs/logits of their prompts, can be run on all task types: `generate_until`, `loglikelihood`, `loglikelihood_rolling`, and `multiple_choice`.
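To make the distinction concrete, here is a sketch (the model names are illustrative examples only): a chat API that returns no logprobs is limited to generative tasks, while a local model that exposes logits can also run loglikelihood-based tasks.

```bash
# Chat APIs without logprobs: generate_until-style tasks only.
lm_eval --model openai-chat-completions \
  --model_args model=gpt-4o-mini \
  --tasks gsm8k

# Local models supply logits, so loglikelihood-based tasks also work.
lm_eval --model hf \
  --model_args pretrained=EleutherAI/pythia-160m \
  --tasks lambada_openai
```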
@@ -552,30 +553,35 @@ The best way to get support is to open an issue on this repo or join the [Eleuth
Extras dependencies can be installed via `pip install -e ".[NAME]"`
| Name | Use |
|----------------------|-------------------------------------------------------|
| api | For using api models (Anthropic, OpenAI API) |
| audiolm_qwen | For running Qwen2 audio models |
| deepsparse | For running NM's DeepSparse models |
| dev | For linting PRs and contributions |
| gptq | For loading models with AutoGPTQ |
| gptqmodel | For loading models with GPTQModel |
| hf_transfer | For speeding up HF Hub file downloads |
| ibm_watsonx_ai | For using IBM watsonx.ai model apis |
| ifeval | For running the IFEval task |
| ipex | For running on optimum-intel ipex backend |
| japanese_leaderboard | For running Japanese LLM Leaderboard tasks |
| longbench | For running LongBench tasks |
| mamba | For loading Mamba SSM models |
| math | For running math task answer checking |
| multilingual | For multilingual tokenizers |
| neuronx | For running on AWS inf2 instances |
| optimum | For running Intel OpenVINO models |
| promptsource | For using PromptSource prompts |
| ruler | For running RULER tasks |
| sae_lens | For using SAELens to steer models |
| sentencepiece | For using the sentencepiece tokenizer |
| sparseml | For using NM's SparseML models |
| sparsify | For using Sparsify to steer models |
| testing | For running library test suite |
| vllm | For loading models with vLLM |
| wandb | For integration with `Weights and Biases` platform |
| zeno | For visualizing results with Zeno |
|----------------------|-------------------------------------------------------|
| all | Loads all extras (not recommended) |
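Multiple extras can be combined in one comma-separated list; for instance (any subset of the names above works the same way):

```bash
# Editable install from a source checkout with several extras at once:
pip install -e ".[api,math,vllm]"
```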
## Cite as
@@ -59,54 +59,59 @@ Repository = "https://github.com/EleutherAI/lm-evaluation-harness"
[project.optional-dependencies]
api = ["requests", "aiohttp", "tenacity", "tqdm", "tiktoken"]
audiolm_qwen = ["librosa", "soundfile"]
deepsparse = ["deepsparse-nightly[llm]>=1.8.0.20240404"]
dev = ["pytest", "pytest-cov", "pytest-xdist", "pre-commit", "mypy", "unitxt"]
gptq = ["auto-gptq[triton]>=0.6.0"]
gptqmodel = ["gptqmodel>=1.0.9"]
hf_transfer = ["hf_transfer"]
ibm_watsonx_ai = ["ibm_watsonx_ai>=1.1.22", "python-dotenv"]
ifeval = ["langdetect", "immutabledict", "nltk>=3.9.1"]
ipex = ["optimum"]
japanese_leaderboard = ["emoji==2.14.0", "neologdn==0.5.3", "fugashi[unidic-lite]", "rouge_score>=0.1.2"]
longbench = ["jieba", "fuzzywuzzy", "rouge"]
mamba = ["mamba_ssm", "causal-conv1d==1.0.2"]
math = ["sympy>=1.12", "antlr4-python3-runtime==4.11", "math_verify[antlr4_11_0]"]
multilingual = ["nagisa>=0.2.7", "jieba>=0.42.1", "pycountry"]
neuronx = ["optimum[neuronx]"]
optimum = ["optimum[openvino]"]
promptsource = ["promptsource>=0.2.3"]
ruler = ["nltk", "wonderwords", "scipy"]
sae_lens = ["sae_lens"]
sentencepiece = ["sentencepiece>=0.1.98"]
sparseml = ["sparseml-nightly[llm]>=1.8.0.20240404"]
sparsify = ["sparsify"]
testing = ["pytest", "pytest-cov", "pytest-xdist"]
vllm = ["vllm>=0.4.2"]
wandb = ["wandb>=0.16.3", "pandas", "numpy"]
zeno = ["pandas", "zeno-client"]
all = [
    "lm_eval[api]",
    "lm_eval[audiolm_qwen]",
    "lm_eval[deepsparse]",
    "lm_eval[dev]",
    "lm_eval[gptq]",
    "lm_eval[gptqmodel]",
    "lm_eval[hf_transfer]",
    "lm_eval[ibm_watsonx_ai]",
    "lm_eval[ifeval]",
    "lm_eval[ipex]",
    "lm_eval[japanese_leaderboard]",
    "lm_eval[longbench]",
    "lm_eval[mamba]",
    "lm_eval[math]",
    "lm_eval[multilingual]",
    "lm_eval[neuronx]",
    "lm_eval[optimum]",
    "lm_eval[promptsource]",
    "lm_eval[ruler]",
    "lm_eval[sae_lens]",
    "lm_eval[sentencepiece]",
    "lm_eval[sparseml]",
    "lm_eval[sparsify]",
    "lm_eval[testing]",
    "lm_eval[vllm]",
    "lm_eval[wandb]",
    "lm_eval[zeno]",
]
[tool.ruff.lint]