Commit cea47848 authored by lintangsutawika

merged with main

parents af7351ef 45a8f709
@@ -172,7 +172,7 @@ lm_eval --model openai-completions \
--tasks lambada_openai,hellaswag
```
We also support using your own local inference server with an implemented version of the OpenAI ChatCompletions endpoint and passing trained HuggingFace artifacts and tokenizers.
We also support using your own local inference server, provided it mirrors the OpenAI Completions or ChatCompletions API.
```bash
lm_eval --model local-chat-completions --tasks gsm8k --model_args model=facebook/opt-125m,base_url=http://{yourip}:8000/v1
@@ -181,7 +181,7 @@ Note that for externally hosted models, configs such as `--device` and `--batch_
| API or Inference Server | Implemented? | `--model <xxx>` name | Models supported: | Request Types: |
|---------------------------------------------------------------------------------------------------------------------------|---------------------------------|---------------------------------------------------------------------|-----------------------------------------------------------------------------------------------|------------------------------------------------------------|
| OpenAI Completions | :heavy_check_mark: | `openai-completions` | up to `code-davinci-002` | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| OpenAI Completions | :heavy_check_mark: | `openai-completions`, `local-completions` | All OpenAI Completions API models | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| OpenAI ChatCompletions | :heavy_check_mark: | `openai-chat-completions`, `local-chat-completions` | [All ChatCompletions API models](https://platform.openai.com/docs/guides/gpt) | `generate_until` (no logprobs) |
| Anthropic | :heavy_check_mark: | `anthropic` | [Supported Anthropic Engines](https://docs.anthropic.com/claude/reference/selecting-a-model) | `generate_until` (no logprobs) |
| Textsynth | :heavy_check_mark: | `textsynth` | [All supported engines](https://textsynth.com/documentation.html#engines) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
@@ -189,9 +189,12 @@ Note that for externally hosted models, configs such as `--device` and `--batch_
| [Llama.cpp](https://github.com/ggerganov/llama.cpp) (via [llama-cpp-python](https://github.com/abetlen/llama-cpp-python)) | :heavy_check_mark: | `gguf`, `ggml` | [All models supported by llama.cpp](https://github.com/ggerganov/llama.cpp) | `generate_until`, `loglikelihood`, (perplexity evaluation not yet implemented) |
| vLLM | :heavy_check_mark: | `vllm` | [Most HF Causal Language Models](https://docs.vllm.ai/en/latest/models/supported_models.html) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| Mamba | :heavy_check_mark: | `mamba_ssm` | [Mamba architecture Language Models via the `mamba_ssm` package](https://huggingface.co/state-spaces) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| Your local inference server! | :heavy_check_mark: | `local-chat-completions` (using `openai-chat-completions` model type) | Any server address that accepts GET requests using HF models and mirror's OpenAI's ChatCompletions interface | `generate_until` | | ... |
| Your local inference server!                                                                                              | :heavy_check_mark:               | `local-completions` (using `openai-completions` model type)           | Any server address that accepts GET requests using HF models and mirrors OpenAI's Completions interface       | `generate_until`, `loglikelihood`, `loglikelihood_rolling`  |
| Your local inference server!                                                                                              | :heavy_check_mark:               | `local-chat-completions` (using `openai-chat-completions` model type) | Any server address that accepts GET requests using HF models and mirrors OpenAI's ChatCompletions interface   | `generate_until` (no logprobs)                              |
It is on our roadmap to create task variants designed to enable models which do not serve logprobs/loglikelihoods to be compared with generation performance of open-source models.
Models that do not supply logits or logprobs can be used only with tasks of type `generate_until`, while local models, and APIs that do supply logprobs/logits, can be run on all task types: `generate_until`, `loglikelihood`, `loglikelihood_rolling`, and `multiple_choice`.
For more information on the different task `output_types` and model request types, see [our documentation](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/model_guide.md#interface).
### Other Frameworks
......
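For reference, a minimal Python sketch (not part of this commit) of driving one of these local-server backends through `simple_evaluate`, the entry point touched in the next hunk. The `base_url` and `tokenizer_backend` arguments come from the `OpenaiCompletionsLM` changes further down; the server URL, model name, and exact keyword names here are assumptions.

```python
import os

# The OpenAI-backed LM classes read OPENAI_API_KEY unconditionally; per the
# comment in openai_completions.py, a placeholder such as "EMPTY" works locally.
os.environ.setdefault("OPENAI_API_KEY", "EMPTY")

from lm_eval import evaluator

# Hypothetical local endpoint and model name; adjust to your own server.
results = evaluator.simple_evaluate(
    model="local-completions",
    model_args=(
        "model=facebook/opt-125m,"
        "base_url=http://localhost:8000/v1,"
        "tokenizer_backend=huggingface"
    ),
    tasks=["gsm8k"],
)
print(results["results"])
```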
@@ -158,11 +158,16 @@ def simple_evaluate(
)
if lm.rank == 0:
if isinstance(model, str):
model_name = model
elif hasattr(model, "config") and hasattr(model.config, "_name_or_path"):
model_name = model.config._name_or_path
else:
model_name = type(model).__name__
# add info about the model and few shot config
results["config"] = {
"model": model
if isinstance(model, str)
else model.model.config._name_or_path,
"model": model_name,
"model_args": model_args,
"batch_size": batch_size,
"batch_sizes": list(lm.batch_sizes.values())
@@ -492,10 +497,13 @@ def evaluate(
]:
stderr = "_stderr,".join(metric.split(","))
stderr_score = results[task][stderr]
var_score = stderr_score**2
metric_score = results[task][metric]
if stderr_score == "N/A":
var_score = "N/A"
else:
var_score = stderr_score**2
all_stderr.append(stderr)
all_stderr.append(stderr)
metric_score = results[task][metric]
if metric in results[group]:
results[group][metric] = (
@@ -503,15 +511,20 @@
+ metric_score * current_size
) / (total_size + current_size)
# $$s_z^2 = \frac{(n-1) s_x^2 + (m-1) s_y^2}{n+m-1} + \frac{nm(\bar x - \bar y)^2}{(n+m)(n+m-1)}.$$
results[group][stderr] = (
(total_size - 1) * results[group][stderr]
+ (current_size - 1) * var_score
) / (
total_size + current_size - 1
) + total_size * current_size / (
(total_size + current_size)
* (total_size + current_size - 1)
) * (results[group][metric] - metric_score) ** 2
if var_score == "N/A":
results[group][stderr] = "N/A"
else:
results[group][stderr] = (
(total_size - 1) * results[group][stderr]
+ (current_size - 1) * var_score
) / (
total_size + current_size - 1
) + total_size * current_size / (
(total_size + current_size)
* (total_size + current_size - 1)
) * (
results[group][metric] - metric_score
) ** 2
else:
results[group][metric] = metric_score
results[group][stderr] = var_score
......
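The arithmetic guarded by the new `"N/A"` checks above is the pooled mean/variance update quoted in the code comment. An illustrative, self-contained sketch (not part of the commit) of that combination, with the same `"N/A"` propagation:

```python
def pool_two_groups(mean_x, var_x, n, mean_y, var_y, m):
    """Combine the means and sample variances of two subtasks of sizes n and m.

    Mirrors the formula in the comment above:
        s_z^2 = ((n-1)s_x^2 + (m-1)s_y^2) / (n+m-1)
                + n*m*(mean_x - mean_y)**2 / ((n+m)*(n+m-1))
    and propagates "N/A" variances, matching the new guard in evaluate().
    """
    pooled_mean = (mean_x * n + mean_y * m) / (n + m)
    if var_x == "N/A" or var_y == "N/A":
        return pooled_mean, "N/A"
    pooled_var = ((n - 1) * var_x + (m - 1) * var_y) / (n + m - 1) + (
        n * m * (mean_x - mean_y) ** 2 / ((n + m) * (n + m - 1))
    )
    return pooled_mean, pooled_var
```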
@@ -42,7 +42,7 @@ class MambaLMWrapper(HFLM):
The HFLM arguments
`backend`, `revision`, `subfolder`, `tokenizer`, `truncation`, `max_length`,
`backend`, `tokenizer`, `truncation`, `max_length`,
`device`, `dtype`, `batch_size`, `max_batch_size`, `trust_remote_code`, `use_fast_tokenizer`
Are all supported by Mamba where they do not conflict
@@ -98,7 +98,6 @@ please install mamba via `pip install lm-eval[mamba]` or `pip install -e .[mamba
pretrained,
device=self._device,
dtype=torch.float16 if dtype == "auto" else utils.get_dtype(dtype),
**kwargs,
)
def _model_generate(self, context, max_length, stop, **generation_kwargs):
......
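A hypothetical usage sketch for the `mamba_ssm` backend documented above; the checkpoint name and keyword values are assumptions, though the `pretrained` and `dtype` arguments come from the wrapper's docstring and the backend name from the README table.

```python
from lm_eval import evaluator

# Sketch only: evaluate a Mamba checkpoint through the mamba_ssm wrapper.
# Requires the optional dependency: pip install lm-eval[mamba]
results = evaluator.simple_evaluate(
    model="mamba_ssm",
    model_args="pretrained=state-spaces/mamba-130m,dtype=float16",
    tasks=["lambada_openai"],
)
```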
@@ -2,14 +2,14 @@ import copy
import os
from collections import defaultdict
from importlib.util import find_spec
from typing import List, Optional, Tuple
from typing import List, Literal, Optional, Tuple
from tqdm import tqdm
from lm_eval import utils
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from lm_eval.utils import retry_on_specific_exceptions
from lm_eval.utils import eval_logger, retry_on_specific_exceptions
def get_result(response, ctxlen: int) -> Tuple[float, bool]:
@@ -40,7 +40,7 @@ def get_result(response, ctxlen: int) -> Tuple[float, bool]:
return continuation_logprobs, is_greedy
def oa_completion(**kwargs):
def oa_completion(client, chat: bool = False, **kwargs):
"""Query OpenAI API for completion.
Retry with back-off until they respond
@@ -64,19 +64,24 @@ def oa_completion(**kwargs):
on_exception_callback=_exception_callback,
)
def completion():
return openai.completions.create(**kwargs)
if chat:
return client.chat.completions.create(**kwargs)
else:
return client.completions.create(**kwargs)
return completion()
@register_model("openai-completions")
@register_model("openai-completions", "local-completions")
class OpenaiCompletionsLM(LM):
REQ_CHUNK_SIZE = 20
_DEFAULT_MAX_LENGTH = 2048
def __init__(
self,
model: str,
base_url: str = None,
tokenizer: Optional[str] = None,
tokenizer_backend: Literal["tiktoken", "huggingface"] = "tiktoken",
truncate: bool = False,
max_gen_toks: int = 256,
batch_size: int = 1,
@@ -101,15 +106,44 @@ class OpenaiCompletionsLM(LM):
please install these via `pip install lm-eval[openai]` or `pip install -e .[openai]`",
)
self.model = model
self.tokenizer = tiktoken.encoding_for_model(self.model)
self.vocab_size = self.tokenizer.n_vocab
self.base_url = base_url
self.tokenizer_backend = tokenizer_backend
self.truncate = truncate
self.end_of_text_token_id = self.tokenizer.eot_token
self._batch_size = batch_size
self._max_gen_toks = max_gen_toks
self._max_length = max_length
# if we have a local model, use HF tokenizer over tiktoken
if self.tokenizer_backend == "huggingface":
import transformers # noqa: E401
self.tokenizer = transformers.AutoTokenizer.from_pretrained(
tokenizer if tokenizer else self.model
)
self.vocab_size = self.tokenizer.vocab_size
self.end_of_text_token_id = self.tokenizer.eos_token_id
elif self.tokenizer_backend == "tiktoken":
if self.base_url:
eval_logger.warning(
f"Passed `base_url={self.base_url}` but using Tiktoken tokenizer backend. "
"Pass `tokenizer_backend=huggingface` and provide the HF tokenizer name if your model does not use Tiktoken."
)
self.tokenizer = tiktoken.encoding_for_model(self.model)
self.vocab_size = self.tokenizer.n_vocab
self.end_of_text_token_id = self.tokenizer.eot_token
else:
raise ValueError(
f"Expected tokenizer_backend to be one of ['tiktoken', 'huggingface'] but got {self.tokenizer_backend}"
)
# Read from environment variable OPENAI_API_KEY
# Set to EMPTY for local
openai.api_key = os.environ["OPENAI_API_KEY"]
if self.base_url:
self.client = openai.OpenAI(base_url=self.base_url)
else:
self.client = openai.OpenAI()
@property
def eot_token_id(self):
@@ -127,9 +161,8 @@ class OpenaiCompletionsLM(LM):
return self._max_gen_toks
@property
def batch_size(self):
# Isn't used because we override _loglikelihood_tokens
raise NotImplementedError()
def batch_size(self) -> int:
return self._batch_size
@property
def device(self):
@@ -186,7 +219,7 @@ class OpenaiCompletionsLM(LM):
re_ord = utils.Reorderer(requests, _collate)
for chunk in tqdm(
list(utils.chunks(re_ord.get_reordered(), self.REQ_CHUNK_SIZE)),
list(utils.chunks(re_ord.get_reordered(), self.batch_size)),
disable=disable_tqdm,
):
inps = []
@@ -203,6 +236,7 @@ class OpenaiCompletionsLM(LM):
ctxlens.append(ctxlen)
response = oa_completion(
client=self.client,
model=self.model,
prompt=inps,
echo=True,
@@ -251,7 +285,7 @@ class OpenaiCompletionsLM(LM):
# todo: more intelligent batching for heterogeneous `until`
for chunk, request_args in tqdm(
list(sameuntil_chunks(re_ord.get_reordered(), self.REQ_CHUNK_SIZE))
list(sameuntil_chunks(re_ord.get_reordered(), self.batch_size))
):
inps = []
self._max_gen_toks = request_args.pop("max_gen_toks", self.max_gen_toks)
@@ -265,6 +299,7 @@ class OpenaiCompletionsLM(LM):
request_args["temperature"] = request_args.get("temperature", 0)
response = oa_completion(
client=self.client,
model=self.model,
prompt=inps,
max_tokens=self.max_gen_toks,
@@ -329,35 +364,6 @@ class OpenaiCompletionsLM(LM):
return loglikelihoods
def oa_chat_completion(client, **kwargs):
"""Query OpenAI API for chat completion.
Retry with back-off until they respond
"""
if not find_spec("openai") or not find_spec("tiktoken"):
raise Exception(
"attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. "
"Please install these via `pip install lm-eval[openai]` or `pip install -e .[openai]`"
)
else:
import openai
def _exception_callback(e: Exception, sleep_time: float) -> None:
import traceback
traceback.print_exc()
@retry_on_specific_exceptions(
on_exceptions=[openai.OpenAIError],
max_retries=None, # retry forever, consider changing
on_exception_callback=_exception_callback,
)
def completion():
return client.chat.completions.create(**kwargs)
return completion()
@register_model("openai-chat-completions", "local-chat-completions")
class OpenaiChatCompletionsLM(LM):
def __init__(
@@ -460,8 +466,12 @@ class OpenaiChatCompletionsLM(LM):
f"Expected repr(kwargs) to be of type repr(dict) but got {kwargs}"
)
response = oa_chat_completion(
client=self.client, messages=inps, model=self.model, **kwargs
response = oa_completion(
client=self.client,
chat=True,
messages=inps,
model=self.model,
**kwargs,
)
for resp, (context, args_) in zip(response.choices, chunk):
......
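A construction-level sketch tying together the new `base_url`, `tokenizer_backend`, and `batch_size` pieces of `OpenaiCompletionsLM` above, aimed at a local OpenAI-compatible server. Illustrative only: the endpoint and model names are placeholders, and the import path assumes the class lives in `lm_eval/models/openai_completions.py`.

```python
import os

# The constructor reads OPENAI_API_KEY unconditionally; the in-code comment
# suggests "EMPTY" is sufficient for local servers.
os.environ.setdefault("OPENAI_API_KEY", "EMPTY")

from lm_eval.models.openai_completions import OpenaiCompletionsLM

lm = OpenaiCompletionsLM(
    model="facebook/opt-125m",            # model name your server exposes
    base_url="http://localhost:8000/v1",  # new: routes requests to the local server
    tokenizer_backend="huggingface",      # new: use an HF tokenizer instead of tiktoken
    tokenizer="facebook/opt-125m",        # HF tokenizer to load for token counting
    batch_size=8,                         # now honored when chunking requests
)
```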
group:
- ai2_arc
task: arc_easy
dataset_path: ai2_arc
dataset_path: allenai/ai2_arc
dataset_name: ARC-Easy
output_type: multiple_choice
training_split: train
......
group: qasper
task: qasper_bool
dataset_path: qasper
dataset_path: allenai/qasper
output_type: multiple_choice
training_split: train
validation_split: validation
......
group: qasper
task: qasper_freeform
dataset_path: qasper
dataset_path: allenai/qasper
output_type: generate_until
training_split: train
validation_split: validation
......
@@ -22,12 +22,13 @@ def load_changed_files(file_path: str) -> List[str]:
# checks the txt file for list of changed files.
# if file ends with .yaml then check yaml for task name
# if file ends with .py then parse the folder for all yaml files
# skips benchmarks folder
def parser(full_path: List[str]) -> List[str]:
_output = set()
for x in full_path:
if x.endswith(".yaml"):
if x.endswith(".yaml") and "benchmarks" not in x:
_output.add(load_yaml_config(x)["task"])
elif x.endswith(".py"):
elif x.endswith(".py") and "benchmarks" not in x:
path = [str(x) for x in (list(Path(x).parent.glob("*.yaml")))]
_output |= {load_yaml_config(x)["task"] for x in path}
return list(_output)
......