Commit 60c9c170 authored by haileyschoelkopf

Merge branch 'main' into inverse-scaling-tasks

parents 4b2d565b b4cd85d4
from .evaluation_tracker import EvaluationTracker
from .wandb_logger import WandbLogger
import json
import re
import time
from dataclasses import asdict, dataclass
from datetime import datetime
from pathlib import Path
from huggingface_hub import HfApi
from lm_eval.utils import (
eval_logger,
handle_non_serializable,
hash_string,
)
@dataclass(init=False)
class GeneralConfigTracker:
"""
Tracker for the evaluation parameters.
Attributes:
model_source (str): Source of the model (e.g. Hugging Face, GGUF, etc.)
model_name (str): Name of the model.
model_name_sanitized (str): Sanitized model name for directory creation.
start_time (float): Start time of the experiment. Logged at class init.
end_time (float): End time of the experiment. Logged when calling [`GeneralConfigTracker.log_end_time`].
total_evaluation_time_seconds (str): Inferred total evaluation time in seconds (from the start and end times).
"""
model_source: str = None
model_name: str = None
model_name_sanitized: str = None
start_time: float = None
end_time: float = None
total_evaluation_time_seconds: str = None
def __init__(self) -> None:
"""Starts the evaluation timer."""
self.start_time = time.perf_counter()
@staticmethod
def _get_model_name(model_args: str) -> str:
"""Extracts the model name from the model arguments."""
def extract_model_name(model_args: str, key: str) -> str:
"""Extracts the model name from the model arguments using a key."""
args_after_key = model_args.split(key)[1]
return args_after_key.split(",")[0]
# order does matter, e.g. peft and delta are provided together with pretrained
prefixes = ["peft=", "delta=", "pretrained=", "model=", "path=", "engine="]
for prefix in prefixes:
if prefix in model_args:
return extract_model_name(model_args, prefix)
return ""
def log_experiment_args(
self,
model_source: str,
model_args: str,
) -> None:
"""Logs model parameters and job ID."""
self.model_source = model_source
self.model_name = GeneralConfigTracker._get_model_name(model_args)
self.model_name_sanitized = re.sub(
r"[\"<>:/\|\\?\*\[\]]+", "__", self.model_name
)
def log_end_time(self) -> None:
"""Logs the end time of the evaluation and calculates the total evaluation time."""
self.end_time = time.perf_counter()
self.total_evaluation_time_seconds = str(self.end_time - self.start_time)
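# Illustrative sketch (not part of the library): how GeneralConfigTracker is driven
# over the life of a run. The `model_args` string is a hypothetical example of the
# comma-separated argument format parsed by `_get_model_name`.
def _example_general_config_tracker() -> GeneralConfigTracker:
    tracker = GeneralConfigTracker()  # starts the timer
    tracker.log_experiment_args(
        model_source="hf",
        model_args="pretrained=EleutherAI/pythia-70m,dtype=float32",
    )
    # model_name is the value after the first matching prefix, so here
    # tracker.model_name == "EleutherAI/pythia-70m" and
    # tracker.model_name_sanitized == "EleutherAI__pythia-70m".
    tracker.log_end_time()  # fills end_time and total_evaluation_time_seconds
    return tracker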
class EvaluationTracker:
"""
Keeps track of and saves relevant information about the evaluation process.
Compiles the data from trackers and writes it to files, which can be published to the Hugging Face hub if requested.
"""
def __init__(
self,
output_path: str = None,
hub_results_org: str = "",
hub_repo_name: str = "",
push_results_to_hub: bool = False,
push_samples_to_hub: bool = False,
public_repo: bool = False,
token: str = "",
) -> None:
"""
Creates all the necessary loggers for evaluation tracking.
Args:
output_path (str): Path to save the results. If not provided, the results won't be saved.
hub_results_org (str): The Hugging Face organisation to push the results to. If not provided, the results won't be pushed.
hub_repo_name (str): The name of the Hugging Face repository to push the results to. If not provided, the results will be pushed to `lm-eval-results`.
push_results_to_hub (bool): Whether to push the results to the Hugging Face hub.
push_samples_to_hub (bool): Whether to push the samples to the Hugging Face hub.
public_repo (bool): Whether to push the results to a public or private repository.
token (str): Token to use when pushing to the Hugging Face hub. This token should have write access to `hub_results_org`.
"""
self.general_config_tracker = GeneralConfigTracker()
self.output_path = output_path
self.hub_results_org = hub_results_org
hub_repo_name = hub_repo_name if hub_repo_name else "lm-eval-results"
self.hub_results_repo = f"{hub_results_org}/{hub_repo_name}"
self.hub_results_repo_private = f"{hub_results_org}/{hub_repo_name}-private"
self.push_results_to_hub = push_results_to_hub
self.push_samples_to_hub = push_samples_to_hub
self.public_repo = public_repo
self.api = HfApi(token=token) if token else None
def save_results_aggregated(
self,
results: dict,
samples: dict,
) -> None:
"""
Saves the aggregated results and samples to the output path and pushes them to the Hugging Face hub if requested.
Args:
results (dict): The aggregated results to save.
samples (dict): The samples results to save.
"""
self.general_config_tracker.log_end_time()
if self.output_path:
try:
eval_logger.info("Saving results aggregated")
# calculate cumulative hash for each task - only if samples are provided
task_hashes = {}
if samples:
for task_name, task_samples in samples.items():
sample_hashes = [
s["doc_hash"] + s["prompt_hash"] + s["target_hash"]
for s in task_samples
]
task_hashes[task_name] = hash_string("".join(sample_hashes))
# update initial results dict
results.update({"task_hashes": task_hashes})
results.update(asdict(self.general_config_tracker))
dumped = json.dumps(
results,
indent=2,
default=handle_non_serializable,
ensure_ascii=False,
)
path = Path(self.output_path if self.output_path else Path.cwd())
path = path.joinpath(self.general_config_tracker.model_name_sanitized)
path.mkdir(parents=True, exist_ok=True)
self.date_id = datetime.now().isoformat().replace(":", "-")
file_results_aggregated = path.joinpath(f"results_{self.date_id}.json")
file_results_aggregated.open("w", encoding="utf-8").write(dumped)
if self.api and self.push_results_to_hub:
self.api.create_repo(
repo_id=self.hub_results_repo
if self.public_repo
else self.hub_results_repo_private,
repo_type="dataset",
private=not self.public_repo,
exist_ok=True,
)
self.api.upload_folder(
repo_id=self.hub_results_repo
if self.public_repo
else self.hub_results_repo_private,
folder_path=str(path),
path_in_repo=self.general_config_tracker.model_name_sanitized,
repo_type="dataset",
commit_message=f"Adding aggregated results for {self.general_config_tracker.model_name}",
)
except Exception as e:
eval_logger.warning("Could not save results aggregated")
eval_logger.info(repr(e))
else:
eval_logger.info(
"Output path not provided, skipping saving results aggregated"
)
def save_results_samples(
self,
task_name: str,
samples: dict,
) -> None:
"""
Saves the samples results to the output path and pushes them to the Hugging Face hub if requested.
Args:
task_name (str): The task name to save the samples for.
samples (dict): The samples results to save.
"""
if self.output_path:
try:
eval_logger.info("Saving samples results")
samples_dumped = json.dumps(
samples,
indent=2,
default=handle_non_serializable,
ensure_ascii=False,
)
path = Path(self.output_path if self.output_path else Path.cwd())
path = path.joinpath(self.general_config_tracker.model_name_sanitized)
path.mkdir(parents=True, exist_ok=True)
file_results_samples = path.joinpath(
f"samples_{task_name}_{self.date_id}.json"
)
file_results_samples.write_text(samples_dumped, encoding="utf-8")
if self.api and self.push_samples_to_hub:
self.api.create_repo(
self.hub_results_repo
if self.public_repo
else self.hub_results_repo_private,
repo_type="dataset",
private=not self.public_repo,
exist_ok=True,
)
self.api.upload_folder(
repo_id=self.hub_results_repo
if self.public_repo
else self.hub_results_repo_private,
folder_path=str(path),
path_in_repo=self.general_config_tracker.model_name_sanitized,
repo_type="dataset",
commit_message=f"Adding samples results for {task_name} to {self.general_config_tracker.model_name}",
)
except Exception as e:
eval_logger.warning("Could not save sample results")
eval_logger.info(repr(e))
else:
eval_logger.info("Output path not provided, skipping saving sample results")
import logging
import os
import re
import subprocess
from pathlib import Path
from typing import Any, Dict, Optional, Tuple, Union
import numpy as np
from torch.utils.collect_env import get_pretty_env_info
from transformers import __version__ as trans_version
logger = logging.getLogger(__name__)
def remove_none_pattern(input_string: str) -> Tuple[str, bool]:
"""Remove the ',none' substring from the input_string if it exists at the end.
Args:
input_string (str): The input string from which to remove the ',none' substring.
Returns:
Tuple[str, bool]: A tuple containing the modified input_string with the ',none' substring removed
and a boolean indicating whether the modification was made (True) or not (False).
"""
# Define the pattern to match ',none' at the end of the string
pattern = re.compile(r",none$")
# Use sub() to replace ',none' with an empty string
result = re.sub(pattern, "", input_string)
# check if the input_string changed
removed = result != input_string
return result, removed
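# Quick illustration (not part of the library) of the ",none" suffix stripping.
def _example_remove_none_pattern() -> None:
    assert remove_none_pattern("acc,none") == ("acc", True)
    assert remove_none_pattern("acc_norm") == ("acc_norm", False)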
def _handle_non_serializable(o: Any) -> Union[int, str, list]:
"""Handle non-serializable objects by converting them to serializable types.
Args:
o (Any): The object to be handled.
Returns:
Union[int, str, list]: The converted object. If the object is of type np.int64 or np.int32,
it will be converted to int. If the object is of type set, it will be converted
to a list. Otherwise, it will be converted to str.
"""
if isinstance(o, np.int64) or isinstance(o, np.int32):
return int(o)
elif isinstance(o, set):
return list(o)
else:
return str(o)
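# Quick illustration (not part of the library): the helper is meant to be passed
# as the `default` hook of json.dumps so numpy integers and sets do not raise.
def _example_handle_non_serializable() -> str:
    import json

    payload = {"count": np.int64(3), "labels": {"b", "a"}}
    return json.dumps(payload, default=_handle_non_serializable, sort_keys=True)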
def get_commit_from_path(repo_path: Union[Path, str]) -> Optional[str]:
try:
git_folder = Path(repo_path, ".git")
if git_folder.is_file():
git_folder = Path(
git_folder.parent,
git_folder.read_text(encoding="utf-8").split("\n")[0].split(" ")[-1],
)
if Path(git_folder, "HEAD").exists():
head_name = (
Path(git_folder, "HEAD")
.read_text(encoding="utf-8")
.split("\n")[0]
.split(" ")[-1]
)
head_ref = Path(git_folder, head_name)
git_hash = head_ref.read_text(encoding="utf-8").replace("\n", "")
else:
git_hash = None
except Exception as err:
logger.debug(
f"Failed to retrieve a Git commit hash from path: {str(repo_path)}. Error: {err}"
)
return None
return git_hash
def get_git_commit_hash():
"""
Gets the git commit hash of your current repo (if it exists).
Source: https://github.com/EleutherAI/gpt-neox/blob/b608043be541602170bfcfb8ec9bf85e8a0799e0/megatron/neox_arguments/neox_args.py#L42
"""
try:
git_hash = subprocess.check_output(["git", "describe", "--always"]).strip()
git_hash = git_hash.decode()
except (subprocess.CalledProcessError, FileNotFoundError):
# FileNotFoundError occurs when git not installed on system
git_hash = get_commit_from_path(os.getcwd()) # git hash of repo if exists
return git_hash
def add_env_info(storage: Dict[str, Any]):
try:
pretty_env_info = get_pretty_env_info()
except Exception as err:
pretty_env_info = str(err)
transformers_version = trans_version
upper_dir_commit = get_commit_from_path(
Path(os.getcwd(), "..")
) # git hash of upper repo if exists
added_info = {
"pretty_env_info": pretty_env_info,
"transformers_version": transformers_version,
"upper_git_hash": upper_dir_commit, # in case this repo is submodule
}
storage.update(added_info)
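# Illustrative sketch (not part of the library): combining the helpers above to
# stamp an arbitrary results dict with reproducibility metadata.
def _example_env_metadata() -> Dict[str, Any]:
    results: Dict[str, Any] = {"git_hash": get_git_commit_hash()}
    # adds pretty_env_info, transformers_version and upper_git_hash in place
    add_env_info(results)
    return results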
import copy
import json
import logging
import os
import re
import subprocess
from pathlib import Path
from typing import Any, Dict, List, Literal, Optional, Tuple, Union
from typing import Any, Dict, List, Literal, Tuple
import numpy as np
import pandas as pd
from packaging.version import Version
from torch.utils.collect_env import get_pretty_env_info
from transformers import __version__ as trans_version
from lm_eval.loggers.utils import _handle_non_serializable, remove_none_pattern
logger = logging.getLogger(__name__)
def remove_none_pattern(input_string: str) -> Tuple[str, bool]:
"""Remove the ',none' substring from the input_string if it exists at the end.
Args:
input_string (str): The input string from which to remove the ',none' substring.
Returns:
Tuple[str, bool]: A tuple containing the modified input_string with the ',none' substring removed
and a boolean indicating whether the modification was made (True) or not (False).
"""
# Define the pattern to match ',none' at the end of the string
pattern = re.compile(r",none$")
# Use sub() to replace ',none' with an empty string
result = re.sub(pattern, "", input_string)
# check if the input_string changed
removed = result != input_string
return result, removed
def _handle_non_serializable(o: Any) -> Union[int, str, list]:
"""Handle non-serializable objects by converting them to serializable types.
Args:
o (Any): The object to be handled.
Returns:
Union[int, str, list]: The converted object. If the object is of type np.int64 or np.int32,
it will be converted to int. If the object is of type set, it will be converted
to a list. Otherwise, it will be converted to str.
"""
if isinstance(o, np.int64) or isinstance(o, np.int32):
return int(o)
elif isinstance(o, set):
return list(o)
else:
return str(o)
def get_wandb_printer() -> Literal["Printer"]:
...@@ -395,55 +350,3 @@ class WandbLogger:
self._log_samples_as_artifact(eval_preds, task_name)
self.run.log({f"{group}_eval_results": grouped_df})
def get_commit_from_path(repo_path: Path) -> Optional[str]:
git_folder = Path(repo_path, ".git")
if git_folder.is_file():
git_folder = Path(
git_folder.parent,
git_folder.read_text(encoding="utf-8").split("\n")[0].split(" ")[-1],
)
if Path(git_folder, "HEAD").exists():
head_name = (
Path(git_folder, "HEAD")
.read_text(encoding="utf-8")
.split("\n")[0]
.split(" ")[-1]
)
head_ref = Path(git_folder, head_name)
git_hash = head_ref.read_text(encoding="utf-8").replace("\n", "")
else:
git_hash = None
return git_hash
def get_git_commit_hash():
"""
Gets the git commit hash of your current repo (if it exists).
Source: https://github.com/EleutherAI/gpt-neox/blob/b608043be541602170bfcfb8ec9bf85e8a0799e0/megatron/neox_arguments/neox_args.py#L42
"""
try:
git_hash = subprocess.check_output(["git", "describe", "--always"]).strip()
git_hash = git_hash.decode()
except (subprocess.CalledProcessError, FileNotFoundError):
# FileNotFoundError occurs when git not installed on system
git_hash = get_commit_from_path(os.getcwd()) # git hash of repo if exists
return git_hash
def add_env_info(storage: Dict[str, Any]):
try:
pretty_env_info = get_pretty_env_info()
except Exception as err:
pretty_env_info = str(err)
transformers_version = trans_version
upper_dir_commit = get_commit_from_path(
Path(os.getcwd(), "..")
) # git hash of upper repo if exists
added_info = {
"pretty_env_info": pretty_env_info,
"transformers_version": transformers_version,
"upper_git_hash": upper_dir_commit, # in case this repo is submodule
}
storage.update(added_info)
...@@ -4,6 +4,8 @@ from . import (
gguf,
huggingface,
mamba_lm,
nemo_lm,
neuralmagic,
neuron_optimum,
openai_completions,
optimum_lm,
......
...@@ -45,7 +45,7 @@ def anthropic_completion(
except ModuleNotFoundError:
raise Exception(
"attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e .[anthropic]`",
please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
)
def _exception_callback(e: Exception, sleep_time: float) -> None:
...@@ -74,6 +74,70 @@ please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e
return completion()
def anthropic_chat(
client, #: anthropic.Anthropic,
model: str,
prompt: str,
max_tokens: int,
temperature: float,
stop: List[str],
**kwargs: Any,
) -> str:
"""Wrapper function around the Anthropic completion API client with exponential back-off
in case of RateLimitError.
params:
client: anthropic.Anthropic
Anthropic API client
model: str
Anthropic model e.g. 'claude-3-opus-20240229', 'claude-3-sonnet-20240229'
prompt: str
Prompt to feed to the model
max_tokens: int
Maximum number of tokens to sample from the model
temperature: float
Sampling temperature
stop: List[str]
List of stop sequences
kwargs: Any
Additional model_args to pass to the API client
"""
try:
import anthropic
except ModuleNotFoundError:
raise Exception(
"attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
)
def _exception_callback(e: Exception, sleep_time: float) -> None:
eval_logger.warning(
f"RateLimitError occurred: {e.__cause__}\n Retrying in {sleep_time} seconds"
)
@retry_on_specific_exceptions(
on_exceptions=[
anthropic.RateLimitError,
anthropic.APIConnectionError,
anthropic.APIStatusError,
],
max_retries=None, # retry forever, consider changing
on_exception_callback=_exception_callback,
)
def messages():
response = client.messages.create(
model=model,
max_tokens=max_tokens,
temperature=temperature,
messages=[{"role": "user", "content": f"{prompt}"}],
**kwargs,
)
return response.content[0].text
return messages()
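# Illustrative sketch (not part of this diff): calling the helper above directly.
# It needs the `anthropic` package plus ANTHROPIC_API_KEY in the environment and
# performs a live API call; the model name is only an example.
def _example_anthropic_chat() -> str:
    import anthropic

    client = anthropic.Anthropic()  # picks up ANTHROPIC_API_KEY
    return anthropic_chat(
        client=client,
        model="claude-3-sonnet-20240229",
        prompt="Say hello in one word.",
        max_tokens=16,
        temperature=0.0,
        stop=["\n\n"],
    )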
@register_model("anthropic") @register_model("anthropic")
class AnthropicLM(LM): class AnthropicLM(LM):
REQ_CHUNK_SIZE = 20 # TODO: not used REQ_CHUNK_SIZE = 20 # TODO: not used
...@@ -104,7 +168,7 @@ class AnthropicLM(LM): ...@@ -104,7 +168,7 @@ class AnthropicLM(LM):
except ModuleNotFoundError: except ModuleNotFoundError:
raise Exception( raise Exception(
"attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \ "attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e .[anthropic]`", please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
) )
self.model = model self.model = model
...@@ -153,7 +217,7 @@ please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e ...@@ -153,7 +217,7 @@ please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e
except ModuleNotFoundError: except ModuleNotFoundError:
raise Exception( raise Exception(
"attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \ "attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e .[anthropic]`", please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
) )
if not requests: if not requests:
...@@ -204,3 +268,93 @@ please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e ...@@ -204,3 +268,93 @@ please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e
def loglikelihood_rolling(self, requests, disable_tqdm: bool = False): def loglikelihood_rolling(self, requests, disable_tqdm: bool = False):
raise NotImplementedError("No support for logits.") raise NotImplementedError("No support for logits.")
@register_model("anthropic-chat", "anthropic-chat-completions")
class AnthropicChatLM(AnthropicLM):
REQ_CHUNK_SIZE = 20 # TODO: not used
def __init__(
self,
model: str,
batch_size: int = 1,
max_tokens: int = 256,
temperature: float = 0, # defaults to 1
**kwargs, # top_p, top_k, etc.
) -> None:
"""Anthropic API wrapper.
:param model: str
Anthropic model e.g. 'claude-3-opus-20240229', 'claude-3-sonnet-20240229'
:param max_tokens: int
Maximum number of tokens to sample from the model
:param temperature: float
Sampling temperature
:param kwargs: Any
Additional model_args to pass to the API client
"""
super().__init__()
try:
import anthropic
except ModuleNotFoundError:
raise Exception(
"attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
)
self.model = model
# defaults to os.environ.get("ANTHROPIC_API_KEY")
self.client = anthropic.Anthropic()
self.temperature = temperature
self.max_tokens = max_tokens
self.tokenizer = self.client.get_tokenizer()
self.kwargs = kwargs
@property
def max_gen_toks(self) -> int:
return self.max_tokens
def generate_until(self, requests) -> List[str]:
try:
import anthropic
except ModuleNotFoundError:
raise Exception(
"attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
)
if not requests:
return []
_requests: List[Tuple[str, dict]] = [req.args for req in requests]
res = []
for request in tqdm(_requests):
try:
inp = request[0]
request_args = request[1]
# generation_kwargs
until = request_args.get("until")
max_tokens = request_args.get("max_gen_toks", self.max_length)
temperature = request_args.get("temperature", self.temperature)
response = anthropic_chat(
client=self.client,
model=self.model,
prompt=inp,
max_tokens=max_tokens,
temperature=temperature, # TODO: implement non-greedy sampling for Anthropic
stop=until, # type: ignore
**self.kwargs,
)
res.append(response)
self.cache_hook.add_partial("generate_until", request, response)
except anthropic.APIConnectionError as e: # type: ignore # noqa: F821
eval_logger.critical(f"Server unreachable: {e.__cause__}")
break
except anthropic.APIStatusError as e: # type: ignore # noqa: F821
eval_logger.critical(f"API error {e.status_code}: {e.message}")
break
return res
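# Illustrative sketch (not part of this diff): constructing the chat wrapper
# directly. In the harness it is normally selected via the registered names
# "anthropic-chat" / "anthropic-chat-completions"; the kwargs are examples only.
def _example_anthropic_chat_lm() -> "AnthropicChatLM":
    return AnthropicChatLM(
        model="claude-3-opus-20240229",
        max_tokens=256,
        temperature=0,
    )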
...@@ -13,6 +13,7 @@ from accelerate import ( ...@@ -13,6 +13,7 @@ from accelerate import (
InitProcessGroupKwargs, InitProcessGroupKwargs,
find_executable_batch_size, find_executable_batch_size,
) )
from huggingface_hub import HfApi
from packaging import version from packaging import version
from peft import PeftModel from peft import PeftModel
from peft import __version__ as PEFT_VERSION from peft import __version__ as PEFT_VERSION
...@@ -43,13 +44,13 @@ def _get_accelerate_args( ...@@ -43,13 +44,13 @@ def _get_accelerate_args(
max_memory_per_gpu: Optional[Union[int, str]] = None, max_memory_per_gpu: Optional[Union[int, str]] = None,
max_cpu_memory: Optional[Union[int, str]] = None, max_cpu_memory: Optional[Union[int, str]] = None,
offload_folder: Optional[str] = "./offload", offload_folder: Optional[str] = "./offload",
gpus: Optional[int] = None,
) -> dict: ) -> dict:
"""Returns the kwargs needed to apply `accelerate` in `AutoModel.from_pretrained`.""" """Returns the kwargs needed to apply `accelerate` in `AutoModel.from_pretrained`."""
max_memory = {} max_memory = {}
if max_memory_per_gpu is not None: if max_memory_per_gpu is not None:
max_memory_per_gpu_map = {
device_idx: max_memory_per_gpu
for device_idx in range(torch.cuda.device_count())
}
max_memory_per_gpu_map = {
device_idx: max_memory_per_gpu for device_idx in range(gpus)
}
max_memory.update(max_memory_per_gpu_map)
if max_cpu_memory is not None: if max_cpu_memory is not None:
...@@ -77,7 +78,7 @@ class HFLM(TemplateLM): ...@@ -77,7 +78,7 @@ class HFLM(TemplateLM):
def __init__( def __init__(
self, self,
pretrained: Optional[Union[str, transformers.PreTrainedModel]] = "gpt2",
pretrained: Union[str, transformers.PreTrainedModel],
backend: Optional[Literal["default", "causal", "seq2seq"]] = "default", backend: Optional[Literal["default", "causal", "seq2seq"]] = "default",
# override whether the model should be treated as decoder-only (causal) or encoder-decoder (seq2seq) # override whether the model should be treated as decoder-only (causal) or encoder-decoder (seq2seq)
revision: Optional[str] = "main", revision: Optional[str] = "main",
...@@ -99,6 +100,7 @@ class HFLM(TemplateLM): ...@@ -99,6 +100,7 @@ class HFLM(TemplateLM):
trust_remote_code: Optional[bool] = False, trust_remote_code: Optional[bool] = False,
use_fast_tokenizer: Optional[bool] = True, use_fast_tokenizer: Optional[bool] = True,
add_bos_token: Optional[bool] = False, add_bos_token: Optional[bool] = False,
prefix_token_id: Optional[int] = None,
# arguments used for splitting a model across GPUs naively. # arguments used for splitting a model across GPUs naively.
# only used if `parallelize=True`. # only used if `parallelize=True`.
parallelize: Optional[bool] = False, parallelize: Optional[bool] = False,
...@@ -106,8 +108,9 @@ class HFLM(TemplateLM): ...@@ -106,8 +108,9 @@ class HFLM(TemplateLM):
max_memory_per_gpu: Optional[Union[int, str]] = None, max_memory_per_gpu: Optional[Union[int, str]] = None,
max_cpu_memory: Optional[Union[int, str]] = None, max_cpu_memory: Optional[Union[int, str]] = None,
offload_folder: Optional[Union[str, os.PathLike]] = "./offload", offload_folder: Optional[Union[str, os.PathLike]] = "./offload",
# PEFT and quantization options
# PEFT, delta weights and quantization options
peft: Optional[str] = None, peft: Optional[str] = None,
delta: Optional[str] = None,
autogptq: Optional[Union[bool, str]] = False, autogptq: Optional[Union[bool, str]] = False,
**kwargs, **kwargs,
) -> None: ) -> None:
...@@ -154,7 +157,7 @@ class HFLM(TemplateLM): ...@@ -154,7 +157,7 @@ class HFLM(TemplateLM):
# use user-passed device # use user-passed device
device_list = set( device_list = set(
["cuda", "cpu"] ["cuda", "cpu"]
+ [f"cuda:{i}" for i in range(torch.cuda.device_count())] + [f"cuda:{i}" for i in range(gpus)]
+ ["mps", "mps:0"] + ["mps", "mps:0"]
) )
if device and device in device_list: if device and device in device_list:
...@@ -196,6 +199,15 @@ class HFLM(TemplateLM): ...@@ -196,6 +199,15 @@ class HFLM(TemplateLM):
config=self.config, backend=backend, trust_remote_code=trust_remote_code config=self.config, backend=backend, trust_remote_code=trust_remote_code
) )
# load tokenizer so we know tokenizer vocabulary size before loading model and PEFT
self._create_tokenizer(
pretrained,
tokenizer,
revision=revision,
trust_remote_code=trust_remote_code,
use_fast_tokenizer=use_fast_tokenizer,
)
# if we passed `pretrained` as a string, initialize our model now # if we passed `pretrained` as a string, initialize our model now
if isinstance(pretrained, str): if isinstance(pretrained, str):
self._create_model( self._create_model(
...@@ -204,11 +216,13 @@ class HFLM(TemplateLM): ...@@ -204,11 +216,13 @@ class HFLM(TemplateLM):
dtype=dtype, dtype=dtype,
trust_remote_code=trust_remote_code, trust_remote_code=trust_remote_code,
parallelize=parallelize, parallelize=parallelize,
gpus=gpus,
device_map_option=device_map_option, device_map_option=device_map_option,
max_memory_per_gpu=max_memory_per_gpu, max_memory_per_gpu=max_memory_per_gpu,
max_cpu_memory=max_cpu_memory, max_cpu_memory=max_cpu_memory,
offload_folder=offload_folder, offload_folder=offload_folder,
peft=peft, peft=peft,
delta=delta,
autogptq=autogptq, autogptq=autogptq,
**kwargs, **kwargs,
) )
...@@ -231,14 +245,6 @@ class HFLM(TemplateLM): ...@@ -231,14 +245,6 @@ class HFLM(TemplateLM):
"Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes` or `device_map` is provided. If the desired GPU is being used, this message is safe to ignore." "Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes` or `device_map` is provided. If the desired GPU is being used, this message is safe to ignore."
) )
self._create_tokenizer(
pretrained,
tokenizer,
revision=revision,
trust_remote_code=trust_remote_code,
use_fast_tokenizer=use_fast_tokenizer,
)
self.truncation = truncation self.truncation = truncation
self.logits_cache = logits_cache self.logits_cache = logits_cache
self.vocab_size = self.tokenizer.vocab_size self.vocab_size = self.tokenizer.vocab_size
...@@ -275,7 +281,10 @@ class HFLM(TemplateLM): ...@@ -275,7 +281,10 @@ class HFLM(TemplateLM):
) )
self._max_length = max_length self._max_length = max_length
self.pretrained = pretrained
self.delta = delta
self.peft = peft
self.revision = revision
self.batch_schedule = 1 self.batch_schedule = 1
self.batch_sizes = {} self.batch_sizes = {}
self.max_batch_size = max_batch_size self.max_batch_size = max_batch_size
...@@ -322,9 +331,7 @@ class HFLM(TemplateLM): ...@@ -322,9 +331,7 @@ class HFLM(TemplateLM):
self._model = accelerator.prepare_model( self._model = accelerator.prepare_model(
self.model, evaluation_mode=True self.model, evaluation_mode=True
) )
self._device = torch.device(
f"cuda:{accelerator.local_process_index}"
)
self._device = torch.device(f"{accelerator.device}")
self.accelerator = accelerator self.accelerator = accelerator
if self.accelerator.is_local_main_process: if self.accelerator.is_local_main_process:
...@@ -340,6 +347,12 @@ class HFLM(TemplateLM): ...@@ -340,6 +347,12 @@ class HFLM(TemplateLM):
self._rank = 0 self._rank = 0
self._world_size = 1 self._world_size = 1
self.custom_prefix_token_id = prefix_token_id
if prefix_token_id is not None:
eval_logger.info(
f"Loglikelihood prefix token id used in evaluation: {self.prefix_token_id}"
)
@property @property
def config(self): def config(self):
# return the associated transformers.AutoConfig for the given pretrained model. # return the associated transformers.AutoConfig for the given pretrained model.
...@@ -358,6 +371,15 @@ class HFLM(TemplateLM): ...@@ -358,6 +371,15 @@ class HFLM(TemplateLM):
# we use EOT because end of *text* is more accurate for what we're doing than end of *sentence* # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
return self.tokenizer.eos_token_id return self.tokenizer.eos_token_id
@property
def prefix_token_id(self):
# it is used as prefix for loglikelihood
if self.custom_prefix_token_id is not None:
return self.custom_prefix_token_id
if self.tokenizer.bos_token_id is not None:
return self.tokenizer.bos_token_id
return self.tokenizer.eos_token_id
@property @property
def max_length(self): def max_length(self):
if self._max_length: # if max length manually set, return it if self._max_length: # if max length manually set, return it
...@@ -466,12 +488,14 @@ class HFLM(TemplateLM): ...@@ -466,12 +488,14 @@ class HFLM(TemplateLM):
# only used if `parallelize=True`. # only used if `parallelize=True`.
# (accelerate naive PP (device_map) options) # (accelerate naive PP (device_map) options)
parallelize: Optional[bool] = False, parallelize: Optional[bool] = False,
gpus: Optional[int] = None,
device_map_option: Optional[str] = "auto", device_map_option: Optional[str] = "auto",
max_memory_per_gpu: Optional[Union[int, str]] = None, max_memory_per_gpu: Optional[Union[int, str]] = None,
max_cpu_memory: Optional[Union[int, str]] = None, max_cpu_memory: Optional[Union[int, str]] = None,
offload_folder: Optional[str] = "./offload", offload_folder: Optional[str] = "./offload",
# PEFT and quantization options
# PEFT, delta weights and quantization options
peft: Optional[str] = None, peft: Optional[str] = None,
delta: Optional[str] = None,
autogptq: Optional[Union[bool, str]] = False, autogptq: Optional[Union[bool, str]] = False,
**kwargs, **kwargs,
) -> None: ) -> None:
...@@ -496,6 +520,7 @@ class HFLM(TemplateLM): ...@@ -496,6 +520,7 @@ class HFLM(TemplateLM):
max_memory_per_gpu, max_memory_per_gpu,
max_cpu_memory, max_cpu_memory,
offload_folder, offload_folder,
gpus,
) )
) )
elif "device_map" not in model_kwargs: elif "device_map" not in model_kwargs:
...@@ -504,9 +529,7 @@ class HFLM(TemplateLM): ...@@ -504,9 +529,7 @@ class HFLM(TemplateLM):
# for quantized models now seems to be device_map="auto" # for quantized models now seems to be device_map="auto"
# which breaks data-parallel mode. # which breaks data-parallel mode.
if hasattr(self, "accelerator"): if hasattr(self, "accelerator"):
model_kwargs.update(
{"device_map": {"": f"cuda:{self.accelerator.local_process_index}"}}
)
model_kwargs.update({"device_map": {"": f"{self.accelerator.device}"}})
else: else:
model_kwargs.update({"device_map": {"": str(self.device)}}) model_kwargs.update({"device_map": {"": str(self.device)}})
...@@ -547,12 +570,47 @@ class HFLM(TemplateLM): ...@@ -547,12 +570,47 @@ class HFLM(TemplateLM):
**model_kwargs, **model_kwargs,
) )
if peft and delta:
raise ValueError(
"Cannot use both 'peft' and 'delta' options at the same time."
)
if peft: if peft:
if model_kwargs.get("load_in_4bit", None): if model_kwargs.get("load_in_4bit", None):
assert PEFT_VERSION >= "0.4.0", "load_in_4bit requires peft >= 0.4.0"
if version.parse(PEFT_VERSION) < version.parse("0.4.0"):
raise AssertionError("load_in_4bit requires peft >= 0.4.0")
if self._model.config.vocab_size != len(self.tokenizer):
# resize model for LoRAs with added tokens
self._model.resize_token_embeddings(len(self.tokenizer))
eval_logger.info(
f"Model config indicates vocab_size='{self._model.config.vocab_size}', but found tokenizer with vocab size '{len(self.tokenizer)}'. Resizing model embedding layer..."
)
self._model = PeftModel.from_pretrained( self._model = PeftModel.from_pretrained(
self._model, peft, revision=revision self._model, peft, revision=revision
) )
elif delta:
if autogptq:
eval_logger.warning(
"Delta weights might trigger unexpected behavior when used with AutoGPTQ."
)
_model_delta = self.AUTO_MODEL_CLASS.from_pretrained(
delta,
revision=revision,
torch_dtype=get_dtype(dtype),
trust_remote_code=trust_remote_code,
**model_kwargs,
)
for name, param in self._model.state_dict().items():
try:
param.data += _model_delta.state_dict()[name]
except KeyError:
raise KeyError(f"Delta model is missing weights for layer: {name}")
except Exception as e:
raise RuntimeError(
f"Failed to add delta weights to layer {name}. Error: {e}"
)
del _model_delta
return None return None
...@@ -615,6 +673,8 @@ class HFLM(TemplateLM): ...@@ -615,6 +673,8 @@ class HFLM(TemplateLM):
max_cont_enc = len(continuation_enc[-(self.max_length + 1) :]) max_cont_enc = len(continuation_enc[-(self.max_length + 1) :])
else: else:
max_length = self.max_length max_length = self.max_length
max_context_enc = max_length
max_cont_enc = max_length
# if OOM, then halves batch_size and tries again # if OOM, then halves batch_size and tries again
@find_executable_batch_size(starting_batch_size=self.max_batch_size) @find_executable_batch_size(starting_batch_size=self.max_batch_size)
...@@ -664,14 +724,21 @@ class HFLM(TemplateLM): ...@@ -664,14 +724,21 @@ class HFLM(TemplateLM):
self, string: str, left_truncate_len=None, add_special_tokens=None
) -> List[int]:
""" """
if add_special_tokens is None:
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
add_special_tokens = False or self.add_bos_token
elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
# TODO: investigate best practices for enc-dec models + special tokens
add_special_tokens = True
encoding = self.tokenizer.encode(string, add_special_tokens=add_special_tokens)
# default for None - empty dict, use predefined tokenizer param
# used for all models except for CausalLM or predefined value
special_tokens_kwargs = {}
# by default for CausalLM - false or self.add_bos_token is set
if add_special_tokens is None:
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
special_tokens_kwargs = {
"add_special_tokens": False or self.add_bos_token
}
# otherwise the method explicitly defines the value
else:
special_tokens_kwargs = {"add_special_tokens": add_special_tokens}
encoding = self.tokenizer.encode(string, **special_tokens_kwargs)
# left-truncate the encoded context to be at most `left_truncate_len` tokens long # left-truncate the encoded context to be at most `left_truncate_len` tokens long
if left_truncate_len: if left_truncate_len:
...@@ -690,17 +757,16 @@ class HFLM(TemplateLM): ...@@ -690,17 +757,16 @@ class HFLM(TemplateLM):
old_padding_side = self.tokenizer.padding_side old_padding_side = self.tokenizer.padding_side
self.tokenizer.padding_side = padding_side self.tokenizer.padding_side = padding_side
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
add_special_tokens = False or self.add_bos_token
elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
add_special_tokens = True
encoding = self.tokenizer(
strings,
truncation=truncation,
padding="longest",
return_tensors="pt",
add_special_tokens=add_special_tokens,
)
add_special_tokens = {}
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
add_special_tokens = {"add_special_tokens": False or self.add_bos_token}
encoding = self.tokenizer(
strings,
truncation=truncation,
padding="longest",
return_tensors="pt",
**add_special_tokens,
)
if left_truncate_len: if left_truncate_len:
encoding["input_ids"] = encoding["input_ids"][:, -left_truncate_len:] encoding["input_ids"] = encoding["input_ids"][:, -left_truncate_len:]
...@@ -711,11 +777,8 @@ class HFLM(TemplateLM): ...@@ -711,11 +777,8 @@ class HFLM(TemplateLM):
return encoding["input_ids"], encoding["attention_mask"] return encoding["input_ids"], encoding["attention_mask"]
def tok_decode(self, tokens):
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
return self.tokenizer.decode(tokens)
elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
return self.tokenizer.decode(tokens, skip_special_tokens=True)
def tok_decode(self, tokens, skip_special_tokens=True):
return self.tokenizer.decode(tokens, skip_special_tokens=skip_special_tokens)
def _model_call(self, inps, attn_mask=None, labels=None): def _model_call(self, inps, attn_mask=None, labels=None):
""" """
...@@ -811,7 +874,7 @@ class HFLM(TemplateLM): ...@@ -811,7 +874,7 @@ class HFLM(TemplateLM):
utils.make_disjoint_window, utils.make_disjoint_window,
utils.get_rolling_token_windows( utils.get_rolling_token_windows(
token_list=self.tok_encode(string), token_list=self.tok_encode(string),
prefix_token=self.eot_token_id,
prefix_token=self.prefix_token_id,
max_seq_len=self.max_length, max_seq_len=self.max_length,
context_len=1, context_len=1,
), ),
...@@ -1148,7 +1211,7 @@ class HFLM(TemplateLM): ...@@ -1148,7 +1211,7 @@ class HFLM(TemplateLM):
if "until" in kwargs.keys(): if "until" in kwargs.keys():
until = kwargs.pop("until") until = kwargs.pop("until")
if isinstance(until, str): if isinstance(until, str):
until = [kwargs]
until = [until]
elif not isinstance(until, list): elif not isinstance(until, list):
raise ValueError( raise ValueError(
f"Expected `kwargs['until']` to be of type Union[str,list] but got {until}" f"Expected `kwargs['until']` to be of type Union[str,list] but got {until}"
...@@ -1158,7 +1221,7 @@ class HFLM(TemplateLM): ...@@ -1158,7 +1221,7 @@ class HFLM(TemplateLM):
f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}" f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
) )
# add EOS token to stop sequences # add EOS token to stop sequences
eos = self.tok_decode(self.eot_token_id)
eos = self.tok_decode(self.eot_token_id, skip_special_tokens=False)
if not until: if not until:
until = [eos] until = [eos]
else: else:
...@@ -1221,3 +1284,44 @@ class HFLM(TemplateLM): ...@@ -1221,3 +1284,44 @@ class HFLM(TemplateLM):
pbar.close() pbar.close()
return res return res
def get_model_info(self) -> dict:
"""
Method to get Hugging Face model information for experiment reproducibility.
"""
def get_model_num_params(model) -> int:
if hasattr(model, "num_parameters"):
return model.num_parameters()
if hasattr(model, "parameters"):
return sum(p.numel() for p in model.parameters())
else:
return -1
def get_model_dtype(model) -> str:
if hasattr(model, "dtype"):
return model.dtype
else:
return ""
def get_model_sha(pretrained: str, revision: str) -> str:
try:
model_info = HfApi().model_info(repo_id=pretrained, revision=revision)
return model_info.sha
except Exception as e:
eval_logger.warning(
f"Failed to get model SHA for {pretrained} at revision {revision}. Error: {e}"
)
return ""
model_info = {
"model_num_parameters": get_model_num_params(self._model),
"model_dtype": get_model_dtype(self._model),
"model_revision": self.revision,
"model_sha": get_model_sha(self.pretrained, self.revision),
}
if self.peft:
model_info["peft_sha"] = get_model_sha(self.peft, self.revision)
if self.delta:
model_info["delta_sha"] = get_model_sha(self.delta, self.revision)
return model_info
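# Illustrative sketch (not part of this diff): the new `delta` and
# `prefix_token_id` options are plain constructor kwargs, and get_model_info()
# summarises what was loaded. The model name is an example; running this will
# actually download and load the model.
def _example_hflm_model_info() -> dict:
    lm = HFLM(
        pretrained="EleutherAI/pythia-70m",
        delta=None,            # e.g. a repo with delta weights to add onto the base model
        prefix_token_id=None,  # None -> BOS if available, else EOS (see prefix_token_id)
    )
    return lm.get_model_info()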
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import importlib
import pathlib
from copy import deepcopy
from typing import List, Literal
import filelock
import numpy as np
import torch
from tqdm import tqdm
from lm_eval.api.instance import Instance
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from lm_eval.models.utils import Collator
from lm_eval.utils import (
eval_logger,
get_rolling_token_windows,
make_disjoint_window,
simple_parse_args_string,
)
def _patch_pretrained_cfg(
pretrained_cfg, trainer, tensor_model_parallel_size, pipeline_model_parallel_size
):
try:
import omegaconf
except ModuleNotFoundError:
raise Exception(
"Attempted to use 'nemo_lm' model type, but package `nemo` is not installed"
"Please install nemo following the instructions in the README: either with a NVIDIA PyTorch or NeMo container, "
"or installing nemo following https://github.com/NVIDIA/NeMo.",
)
omegaconf.OmegaConf.set_struct(pretrained_cfg, True)
with omegaconf.open_dict(pretrained_cfg):
attributes_to_update = {
"sequence_parallel": False,
"activations_checkpoint_granularity": None,
"activations_checkpoint_method": None,
"precision": trainer.precision,
"global_batch_size": None,
"tensor_model_parallel_size": tensor_model_parallel_size,
"pipeline_model_parallel_size": pipeline_model_parallel_size,
"apply_rope_fusion": False,
}
for name, value in attributes_to_update.items():
if hasattr(pretrained_cfg, name):
pretrained_cfg[name] = value
return pretrained_cfg
def _get_target_from_class(target_class) -> str:
return f"{target_class.__module__}.{target_class.__name__}"
def load_model(
model_path: str,
trainer,
tensor_model_parallel_size: int,
pipeline_model_parallel_size: int,
) -> torch.nn.Module:
try:
from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import (
MegatronGPTModel,
)
from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector
except ModuleNotFoundError:
raise Exception(
"Attempted to use 'nemo_lm' model type, but package `nemo` is not installed"
"Please install nemo following the instructions in the README: either with a NVIDIA PyTorch or NeMo container, "
"or installing nemo following https://github.com/NVIDIA/NeMo.",
)
model_path = pathlib.Path(model_path)
save_restore_connector = NLPSaveRestoreConnector()
if model_path.is_dir():
save_restore_connector.model_extracted_dir = model_path.as_posix()
pretrained_cfg = save_restore_connector.restore_from(
None, model_path.as_posix(), return_config=True, trainer=trainer
)
if not hasattr(pretrained_cfg, "target"):
pretrained_cfg["target"] = _get_target_from_class(MegatronGPTModel)
pretrained_cfg = _patch_pretrained_cfg(
pretrained_cfg,
trainer,
tensor_model_parallel_size=tensor_model_parallel_size,
pipeline_model_parallel_size=pipeline_model_parallel_size,
)
model_to_load_path = model_path
override_config = pretrained_cfg
module_name, class_name = override_config.target.rsplit(".", 1)
model_class = getattr(importlib.import_module(module_name), class_name)
# monkeypatch _build_tokenizer method to be process-safe
tokenizer_lock = filelock.FileLock(f"/tmp/{model_path.name}.tokenizer.lock")
def _synced_build_tokenizer(self):
with tokenizer_lock:
self._original_build_tokenizer()
model_class._original_build_tokenizer = model_class._build_tokenizer
model_class._build_tokenizer = _synced_build_tokenizer
model = model_class.restore_from(
restore_path=model_to_load_path.as_posix(),
trainer=trainer,
override_config_path=override_config,
save_restore_connector=save_restore_connector,
map_location=f"cuda:{trainer.local_rank}",
)
model.freeze()
model.training = False
try:
# Have to turn off activations_checkpoint_method for inference
model.model.language_model.encoder.activations_checkpoint_method = None
except AttributeError:
pass
return model
def setup_distributed_environment(trainer):
try:
from nemo.utils.app_state import AppState
except ModuleNotFoundError:
raise Exception(
"Attempted to use 'nemo_lm' model type, but package `nemo` is not installed"
"Please install nemo following the instructions in the README: either with a NVIDIA PyTorch or NeMo container, "
"or installing nemo following https://github.com/NVIDIA/NeMo.",
)
def dummy():
return
if trainer.strategy.launcher is not None:
trainer.strategy.launcher.launch(dummy, trainer=trainer)
trainer.strategy.setup_environment()
app_state = AppState()
return app_state
@register_model("nemo_lm")
class NeMoLM(LM):
def __init__(
self,
path: str,
max_length: int = 4096,
batch_size: int = 1,
max_gen_toks: int = 256,
devices: int = 1,
num_nodes: int = 1,
tensor_model_parallel_size: int = 1,
pipeline_model_parallel_size: int = 1,
precision: Literal[
"16-mixed",
"bf16-mixed",
"32-true",
"64-true",
64,
32,
16,
"64",
"32",
"16",
"bf16",
] = "bf16",
**kwargs,
):
try:
from nemo.collections.nlp.modules.common.text_generation_utils import (
generate,
)
from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy
from pytorch_lightning.trainer.trainer import Trainer
self.generate = generate
except ModuleNotFoundError:
raise Exception(
"Attempted to use 'nemo_lm' model type, but package `nemo` is not installed"
"Please install nemo following the instructions in the README: either with a NVIDIA PyTorch or NeMo container, "
"or installing nemo following https://github.com/NVIDIA/NeMo.",
)
super().__init__()
if (
tensor_model_parallel_size == 1
and pipeline_model_parallel_size == 1
and devices > 1
):
eval_logger.info(
f"The number of data replicas for evaluation is {devices}."
)
eval_logger.info(f"The total number of devices is {devices}.")
eval_logger.info(
"No tensor parallelism or pipeline parallelism is applied."
)
elif tensor_model_parallel_size * pipeline_model_parallel_size == devices:
eval_logger.info(
f"Setting tensor parallelism to {tensor_model_parallel_size} and pipeline parallelism to {pipeline_model_parallel_size}."
)
eval_logger.info(f"The total number of devices is {devices}.")
eval_logger.info("No data parallelism is applied.")
else:
raise ValueError(
"Please set the product of tensor_model_parallel_size and pipeline_model_parallel_size"
"equal to the specified number of devices."
)
if num_nodes > 1:
raise ValueError(
"A number of nodes greater than 1 is not supported yet. Please set num_nodes as 1."
)
trainer = Trainer(
strategy=NLPDDPStrategy(),
devices=devices,
accelerator="gpu",
num_nodes=num_nodes,
precision=precision,
logger=False,
enable_checkpointing=False,
use_distributed_sampler=False,
)
# Modify the following flags only for data replication
if (
tensor_model_parallel_size == 1
and pipeline_model_parallel_size == 1
and devices > 1
):
self._device = torch.device(f"cuda:{trainer.global_rank}")
self._rank = trainer.global_rank
self._world_size = trainer.world_size
self.model = load_model(
path,
trainer,
tensor_model_parallel_size=tensor_model_parallel_size,
pipeline_model_parallel_size=pipeline_model_parallel_size,
).cuda()
self.tokenizer = self.model.tokenizer
self.app_state = setup_distributed_environment(trainer)
self._max_length = max_length
self._batch_size = int(batch_size)
self._max_gen_toks = max_gen_toks
@classmethod
def create_from_arg_string(cls, arg_string, additional_config=None):
args = simple_parse_args_string(arg_string)
if additional_config:
args["batch_size"] = additional_config.get("batch_size", 1)
return cls(**args)
@property
def eot_token_id(self):
try:
return self.tokenizer.eos_id
except AttributeError:
return None
@property
def max_length(self):
return self._max_length
@property
def max_gen_toks(self):
return self._max_gen_toks
@property
def batch_size(self):
return self._batch_size
@property
def device(self):
return self._device
@property
def rank(self):
return self._rank
@property
def world_size(self):
return self._world_size
@property
def accelerator(self):
return self._Accelerator(self.world_size)
class _Accelerator:
def __init__(self, world_size):
self.world_size = world_size
def wait_for_everyone(self):
torch.distributed.barrier()
def gather(self, local_tensor):
gathered_tensors = [
torch.zeros(1, dtype=local_tensor.dtype).cuda()
for _ in range(self.world_size)
]
torch.distributed.all_gather(gathered_tensors, local_tensor)
return torch.cat(gathered_tensors)
def tok_encode(self, string: str):
return self.tokenizer.text_to_ids(string)
def tok_decode(self, tokens):
return self.tokenizer.ids_to_text(tokens)
def _encode_pair(self, context, continuation):
n_spaces = len(context) - len(context.rstrip())
if n_spaces > 0:
continuation = context[-n_spaces:] + continuation
context = context[:-n_spaces]
whole_enc = self.tok_encode(context + continuation)
context_enc = self.tok_encode(context)
context_enc_len = len(context_enc)
continuation_enc = whole_enc[context_enc_len:]
return context_enc, continuation_enc
def loglikelihood(self, requests):
new_reqs = []
for context, continuation in [req.args for req in requests]:
if context == "":
# end of text as context
context_enc, continuation_enc = (
[self.eot_token_id],
self.tok_encode(continuation),
)
else:
context_enc, continuation_enc = self._encode_pair(context, continuation)
new_reqs.append(((context, continuation), context_enc, continuation_enc))
return self._loglikelihood_tokens(new_reqs)
def loglikelihood_rolling(
self, requests: List[Instance], disable_tqdm: bool = False
) -> List[float]:
loglikelihoods = []
for (string,) in tqdm([req.args for req in requests], disable=disable_tqdm):
rolling_token_windows = list(
map(
make_disjoint_window,
get_rolling_token_windows(
token_list=self.tok_encode(string),
prefix_token=self.eot_token_id,
max_seq_len=self.max_length - 1,
context_len=1,
),
)
)
rolling_token_windows = [(None,) + x for x in rolling_token_windows]
string_nll = self._loglikelihood_tokens(
rolling_token_windows,
)
# discard is_greedy
string_nll = [x[0] for x in string_nll]
string_nll = sum(string_nll)
loglikelihoods.append(string_nll)
return loglikelihoods
def _loglikelihood_tokens(self, requests, disable_tqdm=False):
res = []
def _collate(x):
toks = x[1] + x[2]
return -len(toks), tuple(toks)
re_ord = Collator(requests, sort_fn=_collate)
chunks = re_ord.get_batched(n=self.batch_size, batch_fn=None)
pbar = tqdm(
total=len(requests),
disable=(disable_tqdm or (self.rank != 0)),
desc="Running loglikelihood requests",
)
for chunk in chunks:
inps = []
ctxlens = []
contlens = []
for _, context_enc, continuation_enc in chunk:
# Leave one token for generation. Tokens_to_generate = 0 breaks NeMo.
inp = (context_enc + continuation_enc)[-(self.max_length - 1) :]
ctxlen = len(context_enc) - max(
0, len(context_enc) + len(continuation_enc) - (self.max_length - 1)
)
ctxlens.append(ctxlen)
contlens.append(len(continuation_enc))
inps.append(self.tok_decode(inp))
output = self.generate(
self.model,
inputs=inps,
tokens_to_generate=1,
min_tokens_to_generate=1,
compute_logprob=True,
all_probs=True,
)
batch_token_ids = np.asarray(output["token_ids"])[:, :-1]
batch_logprobs = output["logprob"][:, :-1]
batch_full_logprob = output["full_logprob"][:, :-1, :]
# Compute greedy tokens for entire batch rather than calling it with proper ctxlen for each sample.
# Additional tokens for each sample will be trimmed later.
min_ctxlen = min(ctxlens)
# Use min_ctxlen-1 instead of min_ctxlen since full_logprobs are not returned for the first token.
batch_greedy_tokens = (
torch.argmax(batch_full_logprob[:, min_ctxlen - 1 :, :], -1)
.cpu()
.numpy()
)
for token_ids, greedy_tokens, logprobs, ctxlen, contlen, (
cache_key,
_,
_,
) in zip(
batch_token_ids,
batch_greedy_tokens,
batch_logprobs,
ctxlens,
contlens,
chunk,
):
# Trim at contlen since shorter contexts in a batch will have more than one token generated.
# Use ctxlen-1 instead of ctxlen same as for full_logprob in batch_greedy_tokens calculation
logprobs = (logprobs[ctxlen - 1 :])[:contlen]
logprob = sum(logprobs).tolist()
continuation_tokens = (token_ids[ctxlen:])[:contlen]
len_diff = ctxlen - min_ctxlen
is_greedy = continuation_tokens == (greedy_tokens[len_diff:])[:contlen]
if not isinstance(is_greedy, bool):
is_greedy = is_greedy.all()
answer = (logprob, is_greedy)
if cache_key is not None:
self.cache_hook.add_partial("loglikelihood", cache_key, answer)
res.append(answer)
pbar.update(1)
pbar.close()
return re_ord.get_original(res)
def generate_until(self, requests):
if not requests:
return []
res = []
def get_until(req_args):
until = req_args.get("until", [])
until = deepcopy(until) # prevent from modifying req_args for cache_key
if self.tokenizer.ids_to_tokens([self.eot_token_id])[0] not in until:
until.append(self.tokenizer.ids_to_tokens([self.eot_token_id])[0])
return until
def _collate(x):
toks = self.tok_encode(x[0])
return len(toks), x[0]
re_ords = Collator(
[reg.args for reg in requests], sort_fn=_collate, group_by="gen_kwargs"
)
chunks = re_ords.get_batched(n=self.batch_size, batch_fn=None)
for chunk in chunks:
contexts, all_gen_kwargs = zip(*chunk)
# we assume all gen kwargs in the batch are the same
# this is safe to assume because the `grouper` object ensures it.
req_args = all_gen_kwargs[0]
# unpack our keyword arguments.
until = get_until(req_args)
max_gen_toks = req_args.get("max_gen_toks", self.max_gen_toks)
remaining_length = self.max_length - max_gen_toks
contexts = []
for context, _ in chunk:
encoded_context = self.tok_encode(context)
encoded_context = encoded_context[-remaining_length:]
contexts.append(self.tok_decode(encoded_context))
output = self.generate(
self.model,
inputs=contexts,
tokens_to_generate=max_gen_toks,
end_strings=until,
greedy=True,
)
answers = output["sentences"]
continuations = []
for context, answer in zip(contexts, answers):
continuations.append(answer[len(context) :])
for term in until:
continuations = [answer.split(term)[0] for answer in continuations]
for request, answer in zip(chunk, continuations):
self.cache_hook.add_partial("greedy_until", request, answer)
res.append(answer)
return re_ords.get_original(res)
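# Illustrative sketch (not part of the library): constructing the NeMo wrapper
# directly. The checkpoint path is a hypothetical placeholder; as enforced in
# __init__, tensor_model_parallel_size * pipeline_model_parallel_size must equal
# `devices` unless both are 1 (pure data replication).
def _example_nemo_lm() -> "NeMoLM":
    return NeMoLM(
        path="/checkpoints/megatron_gpt.nemo",
        devices=1,
        tensor_model_parallel_size=1,
        pipeline_model_parallel_size=1,
        precision="bf16",
    )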
import copy
from typing import List, Optional, Tuple, Union
import numpy
import transformers
from tqdm import tqdm
import lm_eval.models.utils
from lm_eval import utils
from lm_eval.api.instance import Instance
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from lm_eval.models.huggingface import HFLM
eval_logger = utils.eval_logger
@register_model("sparseml")
class SparseMLLM(HFLM):
"""
SparseML is an open-source model optimization toolkit that enables you to create
inference-optimized sparse models using pruning, quantization, and distillation
algorithms. Models optimized with SparseML can then be exported to the ONNX format and
deployed with DeepSparse for GPU-class performance on CPU hardware.
This class is a wrapper around the HuggingFace LM class to enable SparseML
integration with the lm-evaluation-harness.
"""
def _create_model(
self,
pretrained: str,
revision: Optional[str] = "main",
dtype: Optional[str] = "auto",
trust_remote_code: Optional[bool] = False,
**kwargs,
) -> None:
try:
from sparseml.transformers import SparseAutoModelForCausalLM
except ModuleNotFoundError:
raise Exception(
"Package `sparseml` is not installed. "
"Please install it via `pip install sparseml[transformers]`"
)
model_kwargs = kwargs if kwargs else {}
if "device_map" not in model_kwargs:
# set a device_map to initialize model on the right GPU.
# this is needed because it seems that the default behavior
# for quantized models now seems to be device_map="auto"
# which breaks data-parallel mode.
if hasattr(self, "accelerator"):
model_kwargs.update(
{"device_map": {"": f"cuda:{self.accelerator.local_process_index}"}}
)
else:
model_kwargs.update({"device_map": {"": str(self.device)}})
relevant_kwarg_names = [
"offload_folder",
"device_map",
]
relevant_kwargs = {
k: v for k, v in model_kwargs.items() if k in relevant_kwarg_names
}
# Log the difference between model_kwargs and relevant_kwargs so we can see
# what is being ignored
ignored_kwargs = {}
for k, v in model_kwargs.items():
if k not in relevant_kwargs.keys():
ignored_kwargs[k] = v
        eval_logger.warning(
            f"The sparseml integration is ignoring the following specified kwargs: {ignored_kwargs}"
        )
model = SparseAutoModelForCausalLM.from_pretrained(
pretrained,
revision=revision,
torch_dtype=lm_eval.models.utils.get_dtype(dtype),
trust_remote_code=trust_remote_code,
**relevant_kwargs,
)
self._model = model
def _get_config(self, pretrained: str, **kwargs) -> None:
try:
from sparseml.transformers import SparseAutoConfig
except ModuleNotFoundError:
raise Exception(
"Package `sparseml` is not installed. "
"Please install it via `pip install sparseml[transformers]`"
)
self._config = SparseAutoConfig.from_pretrained(
pretrained_model_name_or_path=pretrained, **kwargs
)
def _create_tokenizer(
self,
pretrained: Union[str, transformers.PreTrainedModel],
tokenizer: Optional[
Union[
str,
transformers.PreTrainedTokenizer,
transformers.PreTrainedTokenizerFast,
]
],
**kwargs,
) -> None:
try:
from sparseml.transformers import SparseAutoTokenizer
except ModuleNotFoundError:
raise Exception(
"Package `sparseml` is not installed. "
"Please install it via `pip install sparseml[transformers]`"
)
if tokenizer:
if isinstance(tokenizer, str):
self.tokenizer = SparseAutoTokenizer.from_pretrained(
tokenizer,
**kwargs,
)
else:
assert isinstance(
tokenizer, transformers.PreTrainedTokenizer
) or isinstance(tokenizer, transformers.PreTrainedTokenizerFast)
self.tokenizer = tokenizer
else:
# Get tokenizer based on 'pretrained'
if isinstance(pretrained, str):
model_name = pretrained
else:
# get the HF hub name via accessor on model
model_name = self.model.name_or_path
self.tokenizer = SparseAutoTokenizer.from_pretrained(
model_name,
**kwargs,
)
return None
@register_model("deepsparse")
class DeepSparseLM(LM):
"""
Wrapper around DeepSparse, a sparsity-aware deep learning
inference runtime for CPUs, to make it compatible with the
lm-evaluation-harness.
"""
_DEFAULT_MAX_LENGTH = 2048
def __init__(
self,
pretrained: str,
tokenizer: Optional[
Union[
str,
transformers.PreTrainedTokenizer,
transformers.PreTrainedTokenizerFast,
]
] = None,
batch_size: Optional[Union[int, str]] = 1,
max_gen_toks: Optional[int] = 256,
max_length: Optional[int] = None,
):
super().__init__()
try:
import deepsparse
except ModuleNotFoundError:
raise Exception(
"Package `deepsparse` is not installed. "
"Please install it via `pip install deepsparse[transformers]`"
)
if isinstance(batch_size, str) and not batch_size.isdigit():
eval_logger.warning(
f"batch_size={batch_size} is not valid for deepsparse because it is not an integer. "
"Ignoring and using the default of 1."
)
batch_size = 1
self.batch_size = int(batch_size)
self._max_length = max_length if max_length else self._DEFAULT_MAX_LENGTH
self._max_gen_toks = max_gen_toks
self.batch_sizes = {}
# Initialize new model and tokenizer instances
self.model = deepsparse.TextGeneration(
model_path=pretrained,
sequence_length=self._max_length,
batch_size=batch_size,
)
self.tokenizer = tokenizer if tokenizer else self.model.tokenizer
self.config = self.model.config
def tok_encode(self, string: str) -> List[int]:
return self.tokenizer.encode(string)
def tok_decode(self, tokens: List[int]) -> str:
return self.tokenizer.decode(tokens)
@property
def eot_token_id(self):
# we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
return self.tokenizer.eos_token_id
@property
def prefix_token_id(self):
# it is used as prefix for loglikelihood
if self.tokenizer.bos_token_id is not None:
return self.tokenizer.bos_token_id
return self.tokenizer.eos_token_id
@property
def max_length(self) -> int:
return self._max_length
@property
def max_gen_toks(self) -> int:
return self._max_gen_toks
def loglikelihood(self, requests) -> List[Tuple[float, bool]]:
"""
Copied directly from
https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/huggingface.py
"""
new_reqs = []
for context, continuation in [req.args for req in requests]:
if context == "":
                raise NotImplementedError(
                    "Empty context is not supported yet"
                )
context_enc, continuation_enc = self._encode_pair(context, continuation)
new_reqs.append(((context, continuation), context_enc, continuation_enc))
return self._loglikelihood_tokens(new_reqs)
def _loglikelihood_tokens(
self,
requests: List[Tuple[Tuple[str, str], List[int], List[int]]],
disable_tqdm: bool = False,
) -> List[Tuple[float, bool]]:
"""
The function to compute the loglikelihood of the continuation
tokens given the context tokens.
This function is an adapted version of the original function from
https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/huggingface.py
"""
res = []
def _collate(x):
"""Defines the key for the sorted method"""
toks = x[1] + x[2]
return -len(toks), tuple(toks)
re_ord = utils.Reorderer(requests, _collate)
for chunk in tqdm(
list(lm_eval.models.utils.chunks(re_ord.get_reordered(), self.batch_size)),
disable=disable_tqdm,
):
batch_inp = []
batch_cache_key = []
batch_continuation_enc = []
# len(chunk) is the batch_size
for cache_key, context_enc, continuation_enc in chunk:
                # how this all works (illustrated on a causal decoder-only setup):
                #          CTX      CONT
                # inp    0 1 2 3|4 5 6 7 8 9   <- last token is deleted by inp[:, :-1]
                # model  \               \
                # logits   1 2 3|4 5 6 7 8 9   <- the ctx half gets tossed out by the
                # cont_toks    4 5 6 7 8 9      [:, -len(continuation_enc):, :self.vocab_size] slice  # noqa: E501
inp = (context_enc + continuation_enc)[-(self.max_length + 1) :][:-1]
batch_inp.append(self.tokenizer.decode(inp))
batch_cache_key.append(cache_key)
batch_continuation_enc.append(continuation_enc)
response = self.model(
prompt=batch_inp,
max_new_tokens=0,
output_scores=True,
include_prompt_logits=True,
)
for resp, continuation_enc, cache_key in zip(
response.generations, batch_continuation_enc, batch_cache_key
):
# (seq_len, vocab_size)
multi_scores = resp.score
from deepsparse.utils.data import numpy_log_softmax
                # (seq_len, vocab_size) with log-softmax applied
multi_logits = numpy_log_softmax(multi_scores, axis=1)
# toss out the context half of the sequence
# (cont_len, vocab_size)
continuation_multi_logits = multi_logits[-len(continuation_enc) :]
# pick out the logits for the continuation tokens
# (cont_len,)
continuation_logits = continuation_multi_logits[
numpy.arange(len(continuation_enc)), continuation_enc
]
                # check if the tokens generated greedily are the same
                # as the expected continuation
greedy_tokens = continuation_multi_logits.argmax(axis=1)
max_equal = greedy_tokens.tolist() == continuation_enc
# Answer: (log prob, is-exact-match)
answer = (float(continuation_logits.sum()), bool(max_equal))
res.append(answer)
if cache_key is not None:
self.cache_hook.add_partial("loglikelihood", cache_key, answer)
return re_ord.get_original(res)
def loglikelihood_rolling(self, requests: List[Instance]) -> List[float]:
        raise NotImplementedError(
            "This method is not required by any of our current task integrations."
        )
def generate_until(self, requests: List[Instance]) -> List[str]:
"""
The function to generate a certain number of new tokens
given a context.
This function is an adapted version of the original function from
https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/openai_completions.py
"""
if not requests:
return []
res = []
requests = [req.args for req in requests]
def _collate(x):
toks = self.tok_encode(x[0])
return len(toks), x[0]
re_ord = utils.Reorderer(requests, _collate)
def sameuntil_chunks(xs, size):
ret = []
lastuntil = xs[0][1]
for x in xs:
if len(ret) >= size or x[1] != lastuntil:
yield ret, lastuntil
ret = []
lastuntil = x[1]
ret.append(x)
if ret:
yield ret, lastuntil
pbar = tqdm(total=len(requests))
for chunk, request_args in tqdm(
list(sameuntil_chunks(re_ord.get_reordered(), self.batch_size))
):
inps = []
# make a deepcopy since we are changing arguments
request_args = copy.deepcopy(request_args)
self._max_gen_toks = request_args.pop("max_gen_toks", self.max_gen_toks)
for context, _ in chunk:
# add context (prompts) to the list
inps.append(context)
until = request_args.pop("until", ["<|endoftext|>"])
request_args.pop("do_sample", None)
request_args["temperature"] = request_args.get("temperature", 0)
# run inference (generate max_gen_toks tokens)
out = self.model(
sequences=inps,
max_new_tokens=self.max_gen_toks - 1,
stop=until,
**request_args,
)
for resp, (context, args_) in zip(out.generations, chunk):
text = resp.text
until_ = until
# split the text at the first occurrence of any of the until tokens
for term in until_:
if len(term) > 0:
text = text.split(term)[0]
res.append(text)
self.cache_hook.add_partial(
"generate_until", (context, {"until": until_}), text
)
pbar.update(1)
pbar.close()
return re_ord.get_original(res)
def _encode_pair(
self, context: str, continuation: str
) -> Tuple[List[int], List[int]]:
"""
Copied directly from
https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/huggingface.py
"""
n_spaces = len(context) - len(context.rstrip())
if n_spaces > 0:
continuation = context[-n_spaces:] + continuation
context = context[:-n_spaces]
whole_enc = self.tok_encode(context + continuation)
context_enc = self.tok_encode(context)
context_enc_len = len(context_enc)
continuation_enc = whole_enc[context_enc_len:]
return context_enc, continuation_enc
@@ -305,6 +305,11 @@ class NEURON_HF(TemplateLM):
         # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
         return self.tokenizer.eos_token_id
 
+    @property
+    def prefix_token_id(self):
+        # it is used as prefix for loglikelihood
+        return self.tokenizer.bos_token_id or self.tokenizer.eos_token_id
+
     @property
     def max_length(self):
         if self._max_length:  # if max length manually set, return it
@@ -460,7 +465,7 @@ class NEURON_HF(TemplateLM):
                     utils.make_disjoint_window,
                     utils.get_rolling_token_windows(
                         token_list=self.tok_encode(string),
-                        prefix_token=self.eot_token_id,
+                        prefix_token=self.prefix_token_id,
                         max_seq_len=self.max_length,
                         context_len=1,
                     ),
@@ -659,7 +664,7 @@ class NEURON_HF(TemplateLM):
         if "until" in kwargs.keys():
             until = kwargs.pop("until")
             if isinstance(until, str):
-                until = [kwargs]
+                until = [until]
             elif not isinstance(until, list):
                 raise ValueError(
                     f"Expected `kwargs['until']` to be of type Union[str,list] but got {until}"
...
@@ -14,13 +14,11 @@ from lm_eval.models.utils import retry_on_specific_exceptions
 from lm_eval.utils import eval_logger
 
 
-def get_result(response, ctxlen: int) -> Tuple[float, bool]:
+def get_result(response) -> Tuple[float, bool]:
     """Process results from OpenAI API response.
 
     :param response: dict
         OpenAI API Response
-    :param ctxlen: int
-        Length of context (so we can slice them away and only keep the predictions)
     :return:
         continuation_logprobs: np.array
             Log probabilities of continuation tokens
@@ -29,9 +27,9 @@ def get_result(response, ctxlen: int) -> Tuple[float, bool]:
     """
     is_greedy = True
     logprobs = response.logprobs.token_logprobs
-    continuation_logprobs = sum(logprobs[ctxlen:])
+    continuation_logprobs = sum(logprobs)
 
-    for i in range(ctxlen, len(response.logprobs.token_logprobs)):
+    for i in range(len(response.logprobs.token_logprobs)):
         token = response.logprobs.token_logprobs[i]
         top_tokens = response.logprobs.top_logprobs[i]
         top_token = max(top_tokens.keys(), key=lambda x: top_tokens[x])
@@ -111,7 +109,7 @@ class OpenaiCompletionsLM(TemplateLM):
         self.base_url = base_url
         self.tokenizer_backend = tokenizer_backend
         self.truncate = truncate
-        self._batch_size = batch_size
+        self._batch_size = int(batch_size)
         self._max_gen_toks = max_gen_toks
         self._max_length = max_length
@@ -212,7 +210,6 @@ class OpenaiCompletionsLM(TemplateLM):
                 client=self.client,
                 model=self.model,
                 prompt=inps,
-                echo=True,
                 max_tokens=0,
                 temperature=0.0,
                 logprobs=10,
@@ -222,7 +219,7 @@ class OpenaiCompletionsLM(TemplateLM):
             for resp, ctxlen, (cache_key, context_enc, continuation_enc) in zip(
                 response.choices, ctxlens, chunk
             ):
-                answer = get_result(resp, ctxlen)
+                answer = get_result(resp)
 
                 res.append(answer)
@@ -281,7 +278,7 @@ class OpenaiCompletionsLM(TemplateLM):
                 **{
                     k: v
                     for k, v in request_args.items()
-                    if k not in ["do_sample", "max_gen_toks"]
+                    if k not in {"do_sample", "max_gen_toks", "until"}
                 },
             )
             for resp, (context, args_) in zip(response.choices, chunk):
@@ -433,7 +430,7 @@ class OpenaiChatCompletionsLM(LM):
         if "until" in kwargs.keys():
             until = kwargs.pop("until")
             if isinstance(until, str):
-                until = [kwargs]
+                until = [until]
             elif not isinstance(until, list):
                 raise ValueError(
                     f"Expected repr(kwargs['until']) to be of type Union[str, list] but got {until}"
...
+import json
 from importlib.util import find_spec
 from pathlib import Path
 
+from lm_eval import utils
 from lm_eval.api.registry import register_model
 from lm_eval.models.huggingface import HFLM
 
+eval_logger = utils.eval_logger
+
 
 @register_model("openvino")
 class OptimumLM(HFLM):
     """
     Optimum Intel provides a simple interface to optimize Transformer models and convert them to \
     OpenVINO™ Intermediate Representation (IR) format to accelerate end-to-end pipelines on \
     Intel® architectures using OpenVINO™ runtime.
+
+    To use an OpenVINO config, use `--model_args ov_config` to point to a json file with an OpenVINO config:
+    `lm_eval --model openvino --model_args pretrained=gpt2,ov_config=config.json --task lambada_openai`
+
+    Example json file contents: {"INFERENCE_PRECISION_HINT": "f32", "CACHE_DIR": "model_cache"}
     """
 
     def __init__(
@@ -48,16 +57,25 @@ class OptimumLM(HFLM):
         from optimum.intel.openvino import OVModelForCausalLM
 
         model_kwargs = kwargs if kwargs else {}
+        if "ov_config" in model_kwargs:
+            if not Path(model_kwargs["ov_config"]).exists():
+                raise ValueError(
+                    "ov_config should point to a .json file containing an OpenVINO config"
+                )
+            with open(model_kwargs["ov_config"]) as f:
+                model_kwargs["ov_config"] = json.load(f)
+            eval_logger.info(
+                f"Using custom OpenVINO config: {model_kwargs['ov_config']}"
+            )
+        else:
+            model_kwargs["ov_config"] = {}
+        model_kwargs["ov_config"].setdefault("CACHE_DIR", "")
+
         model_file = Path(pretrained) / "openvino_model.xml"
         if model_file.exists():
             export = False
         else:
             export = True
-            kwargs["ov_config"] = {
-                "PERFORMANCE_HINT": "LATENCY",
-                "NUM_STREAMS": "1",
-                "CACHE_DIR": "",
-            }
         self._model = OVModelForCausalLM.from_pretrained(
             pretrained,
...
@@ -21,10 +21,12 @@ from lm_eval.utils import (
 try:
     import ray
     from vllm import LLM, SamplingParams
+    from vllm.lora.request import LoRARequest
     from vllm.transformers_utils.tokenizer import get_tokenizer
 except ModuleNotFoundError:
     pass
 
 eval_logger = eval_logger
@@ -34,7 +36,7 @@ class VLLM(TemplateLM):
     def __init__(
         self,
-        pretrained="gpt2",
+        pretrained: str,
         dtype: Literal["float16", "bfloat16", "float32", "auto"] = "auto",
         revision: Optional[str] = None,
         trust_remote_code: Optional[bool] = False,
@@ -42,6 +44,7 @@ class VLLM(TemplateLM):
         tokenizer_mode: Literal["auto", "slow"] = "auto",
         tokenizer_revision: Optional[str] = None,
         add_bos_token: Optional[bool] = False,
+        prefix_token_id: Optional[int] = None,
         tensor_parallel_size: int = 1,
         quantization: Optional[str] = None,
         max_gen_toks: int = 256,
@@ -54,6 +57,7 @@ class VLLM(TemplateLM):
         gpu_memory_utilization: float = 0.9,
         device: str = "cuda",
         data_parallel_size: int = 1,
+        lora_local_path: str = None,
         **kwargs,
     ):
         super().__init__()
@@ -96,9 +100,6 @@ class VLLM(TemplateLM):
         if self.data_parallel_size <= 1:
             self.model = LLM(**self.model_args)
         else:
-            assert parse_version(version("vllm")) < parse_version(
-                "0.3.3"
-            ), "data_parallel is only compatible with vllm < v0.3.3."
             eval_logger.warning(
                 "You might experience occasional issues with model weight downloading when data_parallel is in use. To ensure stable performance, run with data_parallel_size=1 until the weights are downloaded and cached."
             )
@@ -118,14 +119,36 @@ class VLLM(TemplateLM):
                 tokenizer_revision=tokenizer_revision,
             )
         self.add_bos_token = add_bos_token
+        self.custom_prefix_token_id = prefix_token_id
+        if prefix_token_id is not None:
+            eval_logger.info(
+                f"Loglikelihood prefix token id used in evaluation: {self.prefix_token_id}"
+            )
         self._max_gen_toks = max_gen_toks
 
+        if lora_local_path is not None:
+            assert parse_version(version("vllm")) > parse_version(
+                "0.3.0"
+            ), "lora adapters only compatible with vllm > v0.3.0."
+            self.lora_request = LoRARequest("finetuned", 1, lora_local_path)
+        else:
+            self.lora_request = None
+
     @property
     def eot_token_id(self):
         # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
         return self.tokenizer.eos_token_id
 
+    @property
+    def prefix_token_id(self):
+        # it is used as prefix for loglikelihood
+        if self.custom_prefix_token_id is not None:
+            return self.custom_prefix_token_id
+        if self.tokenizer.bos_token_id is not None:
+            return self.tokenizer.bos_token_id
+        return self.tokenizer.eos_token_id
+
     @property
     def max_length(self):
         if self._max_length:  # if max length manually set, return it
@@ -208,6 +231,14 @@ class VLLM(TemplateLM):
             # flatten results
             return undistribute(results)
 
+        if self.lora_request is not None:
+            outputs = self.model.generate(
+                prompt_token_ids=requests,
+                sampling_params=sampling_params,
+                use_tqdm=True if self.batch_size == "auto" else False,
+                lora_request=self.lora_request,
+            )
+        else:
             outputs = self.model.generate(
                 prompt_token_ids=requests,
                 sampling_params=sampling_params,
...
# v1.0 Tasks
This list keeps track of which tasks' implementations have been ported to YAML / v2.0 of the Eval Harness.
Boxes should be checked if and only if the task is implemented in the refactor and tested for regression. A task is struck through once it has been checked *against the implementation from the paper that introduced or popularized it*. (WIP) denotes that a PR or person is already working on this task.
- [x] Glue
- [x] SuperGlue
- [x] CoQA
- [x] DROP
- [x] ~~Lambada~~
- [x] Lambada (Cloze variants)
- [x] ~~Lambada (Multilingual)~~
- [x] Wikitext
- [x] PiQA
- [x] PROST
- [x] MCTACO
- [x] Pubmed QA
- [x] SciQ
- [x] QASPER
- [x] QA4MRE
- [x] TriviaQA
- [x] AI2 ARC
- [x] LogiQA
- [x] HellaSwag
- [x] SWAG
- [x] OpenBookQA
- [ ] SQuADv2 (Lintang)
- [x] RACE
- [x] HeadQA
- [x] MathQA
- [x] WebQs
- [x] WSC273
- [x] Winogrande
- [x] ANLI
- [x] Hendrycks Ethics (missing some tasks/metrics, see PR 660: <https://github.com/EleutherAI/lm-evaluation-harness/pull/660> for more info)
- [x] TruthfulQA (mc1)
- [x] TruthfulQA (mc2)
- [x] TruthfulQA (gen)
- [x] MuTual
- [ ] Hendrycks Math (Hailey)
- [x] Asdiv
- [ ] GSM8k
- [x] Arithmetic
- [ ] MMMLU (Hailey)
- [x] Translation (WMT) suite
- [x] Unscramble
- [x] ~~Pile (perplexity)~~
- [x] BLiMP
- [x] ToxiGen
- [x] StoryCloze
- [ ] NaturalQs (Hailey)
- [x] CrowS-Pairs
- [x] XCopa
- [ ] BIG-Bench (Hailey)
- [x] XStoryCloze
- [x] XWinograd
- [x] PAWS-X
- [x] XNLI
- [x] MGSM
- [ ] SCROLLS
- [x] Babi
- [x] Belebele
# Novel Tasks
Tasks added in the revamped harness that were not previously available. Again, a strikethrough denotes checking performed *against the original task's implementation or published results introducing the task*.
# Task Wishlist
- [ ] TheoremQA
- [ ] Theorem Proving evaluations
- [ ] Chain of Thought
- [ ] Self-consistency ; Least-to-Most prompting, etc.
- [ ] Summarization Tasks
- [ ] Anthropic Model-Written Evals
@@ -136,13 +136,14 @@ class TaskManager:
             if "include" in config:
                 if yaml_path is None:
                     raise ValueError
-                config.update(
-                    utils.load_yaml_config(
+                config = {
+                    **utils.load_yaml_config(
                         yaml_path,
                         yaml_config={"include": config.pop("include")},
                         mode="full",
-                    )
-                )
+                    ),
+                    **config,
+                }
             if self._config_is_python_task(config):
                 task_object = config["class"]()
             else:
@@ -356,28 +357,6 @@ class TaskManager:
         return tasks_and_groups
 
 
-def include_path(task_dir):
-    logger = utils.eval_logger
-    logger.setLevel(getattr(logging, "INFO"))
-    logger.info(
-        "To still use tasks loaded from args.include_path,"
-        "see an example of the new TaskManager API in "
-        "https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md#external-library-usage"
-    )
-    return 0
-
-
-def initialize_tasks(verbosity="INFO"):
-    logger = utils.eval_logger
-    logger.setLevel(getattr(logging, f"{verbosity}"))
-    logger.info(
-        "lm_eval.tasks.initialize_tasks() is deprecated and no longer necessary. "
-        "It will be removed in v0.4.2 release. "
-        "TaskManager will instead be used."
-    )
-    return 0
-
-
 def get_task_name_from_config(task_config: Dict[str, str]) -> str:
     if "task" in task_config:
         return task_config["task"]
@@ -401,7 +380,7 @@ def get_task_name_from_object(task_object):
 def get_task_dict(
-    task_name_list: List[Union[str, Dict, Task]],
+    task_name_list: Union[str, List[Union[str, Dict, Task]]],
     task_manager: Optional[TaskManager] = None,
 ):
     """Creates a dictionary of task objects from either a name of task, config, or prepared Task object.
@@ -423,9 +402,20 @@ def get_task_dict(
     if isinstance(task_name_list, str):
         task_name_list = [task_name_list]
+    elif isinstance(task_name_list, list):
+        if not all([isinstance(task, (str, dict, Task)) for task in task_name_list]):
+            raise TypeError(
+                "Expected all list items to be of types 'str', 'dict', or 'Task', but at least one entry did not match."
+            )
+    else:
+        raise TypeError(
+            f"Expected a 'str' or 'list' but received {type(task_name_list)}."
+        )
 
     string_task_name_list = [task for task in task_name_list if isinstance(task, str)]
-    others_task_name_list = [task for task in task_name_list if ~isinstance(task, str)]
+    others_task_name_list = [
+        task for task in task_name_list if not isinstance(task, str)
+    ]
     if len(string_task_name_list) > 0:
         if task_manager is None:
             task_manager = TaskManager()
...
# ACLUE
### Paper
Can Large Language Model Comprehend Ancient Chinese? A Preliminary Test on ACLUE
https://arxiv.org/abs/2310.09550
The Ancient Chinese Language Understanding Evaluation (ACLUE) is an evaluation benchmark focused on ancient Chinese language comprehension. It aims to assess the performance of large-scale language models on understanding ancient Chinese. The benchmark comprises 15 tasks spanning various domains, including lexical, syntactic, semantic, inference, and knowledge. ACLUE's tasks are derived from a combination of manually curated questions from publicly available resources, and automatically
generated questions from classical Chinese language corpora. The questions span from the Xia dynasty (2070 BCE) to the Ming dynasty (1368 CE). ACLUE adopts a multiple-choice question format for all tasks.
Homepage: https://github.com/isen-zhang/ACLUE
### Citation
```bibtex
@inproceedings{zhang-li-2023-large,
title = "Can Large Langauge Model Comprehend {A}ncient {C}hinese? A Preliminary Test on {ACLUE}",
author = "Zhang, Yixuan and Li, Haonan",
booktitle = "Proceedings of the Ancient Language Processing Workshop",
month = sep,
year = "2023",
address = "Varna, Bulgaria",
publisher = "INCOMA Ltd., Shoumen, Bulgaria",
url = "https://aclanthology.org/2023.alp-1.9",
pages = "80--87"
}
```
### Groups and Tasks
#### Groups
- `aclue`: All 15 subjects of the ACLUE dataset, evaluated following the methodology in CMMLU's original implementation.
#### Tasks
The following tasks evaluate subjects in the ACLUE dataset using loglikelihood-based multiple-choice scoring:
- `aclue_{subject_english}`
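
For reference, a hypothetical invocation via the harness CLI is sketched below; `pretrained=gpt2` is only a placeholder model and the 5-shot setting is an assumption rather than a prescribed setup:

```bash
lm_eval --model hf \
    --model_args pretrained=gpt2 \
    --tasks aclue \
    --num_fewshot 5
```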
### Checklist
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation?
* [x] Yes, original implementation contributed by author of the benchmark
If other tasks on this dataset are already supported:
* [x] Is the "Main" variant of this task clearly denoted?
* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [x] Have you noted which, if any, published evaluation setups are matched by this variant?
group: aclue
dataset_path: tyouisen/aclue
test_split: test
fewshot_split: dev
fewshot_config:
  sampler: first_n
output_type: multiple_choice
doc_to_text: "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案:"
doc_to_choice: ["A", "B", "C", "D"]
doc_to_target: "{{['A', 'B', 'C', 'D'].index(Answer)}}"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
metadata:
  version: 0.0
"""
Take in a base YAML template and generate one task YAML per ACLUE subject that includes it.
"""
import argparse
import os
import yaml
from tqdm import tqdm
from lm_eval.utils import eval_logger
SUBJECTS = {
"古文单字多义": "polysemy_resolution",
"诗词情感分类": "poetry_sentiment_analysis",
"古汉语命名体识别": "named_entity_recognition",
"古汉语知识": "basic_ancient_chinese",
"古诗词上下句预测": "poetry_context_prediction",
"古文断句": "sentence_segmentation",
"对联": "couplet_prediction",
"古诗词曲鉴赏": "poetry_appreciate",
"国学常识": "ancient_chinese_culture",
"古音学": "ancient_phonetics",
"通假字": "homographic_character_resolution",
"古代文学知识": "ancient_literature",
"医古文": "ancient_medical",
"古诗词质量评估": "poetry_quality_assessment",
"古文阅读理解": "reading_comprehension",
}
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--base_yaml_path", required=True)
parser.add_argument("--save_prefix_path", default="aclue")
parser.add_argument("--cot_prompt_path", default=None)
parser.add_argument("--task_prefix", default="")
return parser.parse_args()
if __name__ == "__main__":
args = parse_args()
# get filename of base_yaml so we can `"include": ` it in our other YAMLs.
base_yaml_name = os.path.split(args.base_yaml_path)[-1]
with open(args.base_yaml_path, encoding="utf-8") as f:
base_yaml = yaml.full_load(f)
if args.cot_prompt_path is not None:
import json
with open(args.cot_prompt_path, encoding="utf-8") as f:
cot_file = json.load(f)
for subject_zh, subject_eng in tqdm(SUBJECTS.items()):
if args.cot_prompt_path is not None:
description = cot_file[subject_eng]
else:
description = (
f"以下是关于{subject_zh}的单项选择题,请直接给出正确答案的选项。\n\n"
)
yaml_dict = {
"include": base_yaml_name,
"task": f"aclue_{args.task_prefix}_{subject_eng}"
if args.task_prefix != ""
else f"aclue_{subject_eng}",
"dataset_name": subject_eng,
"description": description,
}
file_save_path = args.save_prefix_path + f"_{subject_eng}.yaml"
eval_logger.info(f"Saving yaml for subset {subject_eng} to {file_save_path}")
with open(file_save_path, "w", encoding="utf-8") as yaml_file:
yaml.dump(
yaml_dict,
yaml_file,
width=float("inf"),
allow_unicode=True,
default_style='"',
)
"dataset_name": "ancient_chinese_culture"
"description": "以下是关于国学常识的单项选择题,请直接给出正确答案的选项。\n\n"
"include": "_default_template_yaml"
"task": "aclue_ancient_chinese_culture"
"dataset_name": "ancient_literature"
"description": "以下是关于古代文学知识的单项选择题,请直接给出正确答案的选项。\n\n"
"include": "_default_template_yaml"
"task": "aclue_ancient_literature"