"dgl_sparse/include/vscode:/vscode.git/clone" did not exist on "0698e91a0e4b40bd4a5a4e59205d098e1bb3d3c9"
Unverified commit c2bf7f32, authored by Stella Biderman, committed by GitHub

Merge branch 'master' into cmmlu

parents 26621176 3ccea2b2
-FROM nvidia/cuda:11.2.0-cudnn8-runtime-ubuntu20.04
+FROM nvidia/cuda:11.2.2-cudnn8-runtime-ubuntu20.04
### Install python 3.10 and set it as default python interpreter
RUN apt update && apt install software-properties-common -y && \
add-apt-repository ppa:deadsnakes/ppa -y && apt update && \
apt install curl -y && \
@@ -13,7 +13,7 @@ curl -Ss https://bootstrap.pypa.io/get-pip.py | python3.10 && \
apt-get clean && rm -rf /var/lib/apt/lists/
### Copy files
COPY . /lm-evaluation-harness/
### Set working directory
@@ -22,9 +22,6 @@ WORKDIR /lm-evaluation-harness
### Install requirements
RUN pip install --no-cache-dir -e .
### Run bash
CMD ["/bin/bash"]
@@ -8,7 +8,7 @@ We’d like your help to test it out! you can help by:
1. Trying out your current workloads on the big-refactor branch, and seeing if anything breaks or is counterintuitive,
2. Porting tasks supported in the previous version of the harness to the new YAML configuration format. Please check out our [task implementation guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/new_task_guide.md) for more information.
If you choose to port a task not yet completed according to [our checklist](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/lm_eval/tasks/README.md), then you can contribute it by opening a PR containing [Refactor] in the name with:
- A shell command to run the task in the `master` branch, and what the score is
- A shell command to run the task in your PR branch to `big-refactor`, and what the resulting score is, to show that we achieve equality between the two implementations.
......
@@ -309,7 +309,9 @@ class BaseLM(LM):
if override_bs is not None
else 0,
fn=_batch_scheduler
if self.batch_size == "auto"
and n_reordered_requests > 0
and not override_bs
else None,
):
inps = []
@@ -375,7 +377,9 @@ class BaseLM(LM):
# Slice to original seq length
contlen = len(cont_toks)
inplen = inplen + (
logits.shape[0] - padding_length
)  # if "virtual tokens" (from prompt tuning) are added, inplen is larger
logits = logits[inplen - contlen : inplen].unsqueeze(
0
)  # [1, seq, vocab]
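For intuition, here is a small self-contained sketch of the slice performed above: only the logit rows that belong to the continuation tokens are kept, and a batch dimension is added. The shapes and vocabulary size are toy values, not the harness's real tensors.

```python
import torch

# Toy illustration of the continuation slice above.
padding_length = 8
logits = torch.randn(padding_length, 50257)   # [padded_seq, vocab]
inplen, contlen = 5, 2                        # prompt+continuation length, continuation length
inplen = inplen + (logits.shape[0] - padding_length)  # no virtual tokens here, so unchanged
cont_logits = logits[inplen - contlen : inplen].unsqueeze(0)  # [1, contlen, vocab]
print(cont_logits.shape)  # torch.Size([1, 2, 50257])
```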
......
@@ -74,14 +74,19 @@ def simple_evaluate(
if model_args is None:
model_args = ""
lm = lm_eval.models.get_model(model).create_from_arg_string(
model_args,
{
"batch_size": batch_size,
"max_batch_size": max_batch_size,
"device": device,
},
)
elif isinstance(model, transformers.PreTrainedModel):
lm = lm_eval.models.get_model("hf-causal")(
pretrained=model,
batch_size=batch_size,
max_batch_size=max_batch_size,
)
no_cache = True
else:
assert isinstance(model, lm_eval.base.LM)
@@ -125,7 +130,9 @@ def simple_evaluate(
"model_args": model_args,
"num_fewshot": num_fewshot,
"batch_size": batch_size,
"batch_sizes": list(lm.batch_sizes.values())
if hasattr(lm, "batch_sizes")
else [],
"device": device,
"no_cache": no_cache,
"limit": limit,
......
@@ -4,9 +4,7 @@ from typing import Optional, Union
from lm_eval.base import BaseLM
def _get_dtype(dtype: Union[str, torch.dtype]) -> torch.dtype:
"""Converts `dtype` from `str` to torch.dtype when possible. Does not use an instantiated HF AutoConfig"""
if isinstance(dtype, str) and dtype != "auto":
# Convert `str` args torch dtype: `float16` -> `torch.float16`
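As a quick illustration of the conversion described in that docstring, here is a sketch assuming the usual `getattr`-based mapping; it is not necessarily the harness's exact body.

```python
import torch
from typing import Union

def _get_dtype(dtype: Union[str, torch.dtype]) -> torch.dtype:
    # "float16" -> torch.float16; torch.dtype values and "auto" pass through unchanged
    if isinstance(dtype, str) and dtype != "auto":
        return getattr(torch, dtype)
    return dtype

assert _get_dtype("bfloat16") is torch.bfloat16
assert _get_dtype(torch.float32) is torch.float32
```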
@@ -33,11 +31,10 @@ class HFLM(BaseLM):
max_length=None,
load_in_8bit: Optional[bool] = False,
trust_remote_code: Optional[bool] = False,
dtype: Optional[Union[str, torch.dtype]] = "auto",
):
super().__init__()
# Initialize model
if isinstance(pretrained, transformers.PreTrainedModel):
self.model = pretrained
@@ -45,28 +42,25 @@ class HFLM(BaseLM):
if tokenizer:
assert isinstance(
tokenizer, transformers.PreTrainedTokenizer
) or isinstance(tokenizer, transformers.PreTrainedTokenizerFast)
self.tokenizer = tokenizer
else:
# Get tokenizer
model_name = self.model.name_or_path
self.tokenizer = transformers.AutoTokenizer.from_pretrained(
model_name,
revision=revision,
trust_remote_code=trust_remote_code,
)
elif isinstance(pretrained, str):
# Initialize device
assert isinstance(device, str)
device_list = set(
["cuda", "cpu"]
+ [f"cuda:{i}" for i in range(torch.cuda.device_count())]
)
if device and device in device_list:
self._device = torch.device(device)
@@ -83,21 +77,23 @@ class HFLM(BaseLM):
# Initialize new model and tokenizer instances
self.model = transformers.AutoModelForCausalLM.from_pretrained(
pretrained,
load_in_8bit=load_in_8bit,
low_cpu_mem_usage=low_cpu_mem_usage,
revision=revision,
torch_dtype=_get_dtype(dtype),
trust_remote_code=trust_remote_code,
).to(self.device)
self.tokenizer = transformers.AutoTokenizer.from_pretrained(
tokenizer if tokenizer else pretrained,
revision=revision,
trust_remote_code=trust_remote_code,
)
else:
raise TypeError(
"Parameter pretrained should be of type str or transformers.PreTrainedModel"
)
self.model.eval()
@@ -124,7 +120,7 @@ class HFLM(BaseLM):
@property
def max_length(self):
if self._max_length:  # if max length manually set, return it
return self._max_length
seqlen_config_attrs = ("n_positions", "max_position_embeddings", "n_ctx")
for attr in seqlen_config_attrs:
@@ -136,7 +132,6 @@ class HFLM(BaseLM):
return self.tokenizer.model_max_length
return self._DEFAULT_MAX_LENGTH
@property
def max_gen_toks(self):
return 256
@@ -171,8 +166,10 @@ class HFLM(BaseLM):
def _model_generate(self, context, max_length, eos_token_id):
generation_kwargs = {"do_sample": False, "max_length": max_length}
if eos_token_id is not None:
generation_kwargs["eos_token_id"] = eos_token_id
generation_kwargs[
"pad_token_id"
] = eos_token_id  # setting eos_token_id as pad token
return self.model.generate(context, **generation_kwargs)
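A runnable sketch of the generation call shaped above: greedy decoding with the EOS token id reused as `pad_token_id`, which is what the reformatted assignment sets. The `gpt2` checkpoint is only an example.

```python
import transformers

tok = transformers.AutoTokenizer.from_pretrained("gpt2")
model = transformers.AutoModelForCausalLM.from_pretrained("gpt2")

context = tok("Question: What is 2 + 2?\nAnswer:", return_tensors="pt").input_ids
generation_kwargs = {"do_sample": False, "max_length": 32}
eos_token_id = tok.eos_token_id
if eos_token_id is not None:
    generation_kwargs["eos_token_id"] = eos_token_id
    generation_kwargs["pad_token_id"] = eos_token_id  # reuse EOS as the pad token
print(tok.decode(model.generate(context, **generation_kwargs)[0]))
```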
......
@@ -198,14 +198,13 @@ class GPT3LM(BaseLM):
context_enc = self.tok_encode(context)
inp = context_enc[-(self.max_length - self.max_gen_toks) :]
inps.append(inp)
response = oa_completion(
engine=self.engine,
prompt=inps,
max_tokens=self.max_gen_toks,
temperature=0.0,
logprobs=10,
-stop=until,
+stop=until["until"],
)
for resp, (context, until_) in zip(response.choices, chunk):
......
@@ -19,7 +19,6 @@ _DeviceMapping = NewType("DeviceMapping", Mapping[str, Union[int, str, torch.dev
def _get_accelerate_args(
-low_cpu_mem_usage: Optional[bool] = True,
device_map_option: Optional[str] = "auto",
max_memory_per_gpu: Optional[Union[int, str]] = None,
max_cpu_memory: Optional[Union[int, str]] = None,
@@ -39,7 +38,6 @@ def _get_accelerate_args(
args = {}
if max_memory:
args["max_memory"] = max_memory
-args["low_cpu_mem_usage"] = low_cpu_mem_usage
args["device_map"] = device_map_option
args["offload_folder"] = offload_folder
return args
@@ -94,6 +92,7 @@ class HuggingFaceAutoLM(BaseLM):
load_in_4bit: Optional[bool] = False,
trust_remote_code: Optional[bool] = False,
gptq_use_triton: Optional[bool] = False,
+inject_fused_attention: Optional[bool] = True,
bnb_4bit_quant_type: Optional[str] = None,
bnb_4bit_compute_dtype: Optional[Union[str, torch.dtype]] = None,
bnb_4bit_use_double_quant: Optional[bool] = False,
@@ -160,6 +159,8 @@ class HuggingFaceAutoLM(BaseLM):
If True, will trust the remote code when loading the model.
gptq_use_triton (bool, optional, defaults to False):
Use Triton for GPTQ inference.
+inject_fused_attention (bool, optional, defaults to True):
+Inject fused attention into GPTQ model.
bnb_4bit_quant_type (str, optional, defaults to None):
The quantization type to use for BnB 4bit quantization. See:
https://github.com/huggingface/transformers/blob/main/src/transformers/utils/quantization_config.py#L77
@@ -219,7 +220,6 @@ class HuggingFaceAutoLM(BaseLM):
model_kwargs = {}
if use_accelerate:
model_kwargs = _get_accelerate_args(
-low_cpu_mem_usage,
device_map_option,
max_memory_per_gpu,
max_cpu_memory,
@@ -233,11 +233,13 @@ class HuggingFaceAutoLM(BaseLM):
subfolder=subfolder,
torch_dtype=_get_dtype(dtype, self._config),
gptq_use_triton=gptq_use_triton,
+inject_fused_attention=inject_fused_attention,
load_in_8bit=load_in_8bit,
load_in_4bit=load_in_4bit,
bnb_4bit_quant_type=bnb_4bit_quant_type,
bnb_4bit_compute_dtype=bnb_4bit_compute_dtype,
bnb_4bit_use_double_quant=bnb_4bit_use_double_quant,
+low_cpu_mem_usage=low_cpu_mem_usage,
**model_kwargs,
)
# note: peft_path can be different than pretrained model path
@@ -262,7 +264,9 @@ class HuggingFaceAutoLM(BaseLM):
try:
self.model.to(self._device)
except:
print(
"Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes`. If the desired GPU is being used, this message is safe to ignore."
)
def _create_auto_model(
self,
@@ -280,6 +284,7 @@ class HuggingFaceAutoLM(BaseLM):
trust_remote_code: Optional[bool] = False,
torch_dtype: Optional[Union[str, torch.dtype]] = None,
gptq_use_triton: Optional[bool] = False,
+inject_fused_attention: Optional[bool] = True,
bnb_4bit_quant_type: Optional[str] = None,
bnb_4bit_compute_dtype: Optional[Union[str, torch.dtype]] = None,
bnb_4bit_use_double_quant: Optional[bool] = False,
@@ -287,7 +292,9 @@ class HuggingFaceAutoLM(BaseLM):
"""Returns a pre-trained pytorch model from a pre-trained model configuration."""
if not quantized:
if load_in_4bit:
assert (
transformers.__version__ >= "4.30.0"
), "load_in_4bit requires transformers >= 4.30.0"
model_kwargs = {}
if transformers.__version__ >= "4.30.0":
model_kwargs["load_in_4bit"] = load_in_4bit
@@ -295,9 +302,13 @@ class HuggingFaceAutoLM(BaseLM):
if bnb_4bit_quant_type:
model_kwargs["bnb_4bit_quant_type"] = bnb_4bit_quant_type
if bnb_4bit_compute_dtype:
model_kwargs["bnb_4bit_compute_dtype"] = _get_dtype(
bnb_4bit_compute_dtype
)
if bnb_4bit_use_double_quant:
model_kwargs[
"bnb_4bit_use_double_quant"
] = bnb_4bit_use_double_quant
model = self.AUTO_MODEL_CLASS.from_pretrained(
pretrained,
revision=revision + ("/" + subfolder if subfolder is not None else ""),
@@ -312,15 +323,19 @@ class HuggingFaceAutoLM(BaseLM):
)
else:
from auto_gptq import AutoGPTQForCausalLM
model = AutoGPTQForCausalLM.from_quantized(
pretrained,
model_basename=None if quantized == True else Path(quantized).stem,
device_map=device_map,
max_memory=max_memory,
trust_remote_code=trust_remote_code,
use_safetensors=True
if quantized == True
else quantized.endswith(".safetensors"),
use_triton=gptq_use_triton,
warmup_triton=gptq_use_triton,
+inject_fused_attention=inject_fused_attention,
)
return model
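For context on the new `inject_fused_attention` flag, here is a hedged sketch of the corresponding `auto_gptq` call; the checkpoint path is purely illustrative and not part of this commit.

```python
from auto_gptq import AutoGPTQForCausalLM

# Illustrative only: any GPTQ-quantized checkpoint directory or hub id would do.
model = AutoGPTQForCausalLM.from_quantized(
    "path/to/gptq-quantized-model",
    device_map="auto",
    use_safetensors=True,
    use_triton=False,
    inject_fused_attention=False,  # disable fused attention for architectures that lack support
)
```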
......
@@ -20,6 +20,7 @@ from . import swag
from . import openbookqa
from . import squad
from . import naturalqs
+from . import nqopen
from . import sat
from . import arithmetic
from . import lambada
@@ -151,6 +152,7 @@ TASK_REGISTRY = {
"squad2": squad.SQuAD2,
"race": race.RACE,
# "naturalqs": naturalqs.NaturalQs, # not implemented yet
+"nq_open": nqopen.NQOpen,
"headqa": headqa.HeadQAEsDeprecated,  # for backwards compat - headqa used to default to es
"headqa_es": headqa.HeadQAEs,
"headqa_en": headqa.HeadQAEn,
@@ -328,11 +330,11 @@ TASK_REGISTRY = {
"csatqa_rch": csatqa.RCH,
"csatqa_li": csatqa.LI,
"haerae_hi": haerae.HI,
"haerae_kgk": haerae.KGK,
"haerae_lw": haerae.LW,
"haerae_rc": haerae.RC,
"haerae_rw": haerae.RW,
"haerae_sn": haerae.SN,
# Requires manual download
# Requires manual download of data.
# "storycloze_2016": storycloze.StoryCloze2016,
......
@@ -16,6 +16,7 @@ _CITATION = """
}
"""
class Babi(Task):
VERSION = 0
DATASET_PATH = "Muennighoff/babi"
@@ -43,18 +44,16 @@ class Babi(Task):
return self.dataset["test"]
def doc_to_text(self, doc):
return doc["passage"] + doc["question"]
def should_decontaminate(self):
return False  # TODO Necessary?
def doc_to_decontamination_query(self, doc):
return f"Passage: {doc['passage']}\nQuestion: {doc['question']}\nAnswer:"
def doc_to_target(self, doc):
return " " + doc["answer"]
def construct_requests(self, doc, ctx):
"""Uses RequestFactory to construct Requests and returns an iterable of
......
@@ -12,7 +12,7 @@ from lm_eval.base import MultipleChoiceTask
_CITATION = """
@article{huang2023ceval,
title={C-Eval: A Multi-Level Multi-Discipline Chinese Evaluation Suite for Foundation Models},
author={Huang, Yuzhen and Bai, Yuzhuo and Zhu, Zhihao and Zhang, Junlei and Zhang, Jinghan and Su, Tangjun and Liu, Junteng and Lv, Chuancheng and Zhang, Yikai and Lei, Jiayi and Fu, Yao and Sun, Maosong and He, Junxian},
journal={arXiv preprint arXiv:2305.08322},
year={2023}
@@ -21,58 +21,58 @@ _CITATION = """
SUBJECTS = {
"computer_network": "计算机网络",
"operating_system": "操作系统",
"computer_architecture": "计算机组成",
"college_programming": "大学编程",
"college_physics": "大学物理",
"college_chemistry": "大学化学",
"advanced_mathematics": "高等数学",
"probability_and_statistics": "概率统计",
"discrete_mathematics": "离散数学",
"electrical_engineer": "注册电气工程师",
"metrology_engineer": "注册计量师",
"high_school_mathematics": "高中数学",
"high_school_physics": "高中物理",
"high_school_chemistry": "高中化学",
"high_school_biology": "高中生物",
"middle_school_mathematics": "初中数学",
"middle_school_biology": "初中生物",
"middle_school_physics": "初中物理",
"middle_school_chemistry": "初中化学",
"veterinary_medicine": "兽医学",
"college_economics": "大学经济学",
"business_administration": "工商管理",
"marxism": "马克思主义基本原理",
"mao_zedong_thought": "毛泽东思想和中国特色社会主义理论体系概论",
"education_science": "教育学",
"teacher_qualification": "教师资格",
"high_school_politics": "高中政治",
"high_school_geography": "高中地理",
"middle_school_politics": "初中政治",
"middle_school_geography": "初中地理",
"modern_chinese_history": "近代史纲要",
"ideological_and_moral_cultivation": "思想道德修养与法律基础",
"logic": "逻辑学",
"law": "法学",
"chinese_language_and_literature": "中国语言文学",
"art_studies": "艺术学",
"professional_tour_guide": "导游资格",
"legal_professional": "法律职业资格",
"high_school_chinese": "高中语文",
"high_school_history": "高中历史",
"middle_school_history": "初中历史",
"civil_servant": "公务员",
"sports_science": "体育学",
"plant_protection": "植物保护",
"basic_medicine": "基础医学",
"clinical_medicine": "临床医学",
"urban_and_rural_planner": "注册城乡规划师",
"accountant": "注册会计师",
"fire_engineer": "注册消防工程师",
"environmental_impact_assessment_engineer": "环境影响评价工程师",
"tax_accountant": "税务师",
"physician": "医师资格",
}
@@ -112,11 +112,11 @@ class CevalSubject(MultipleChoiceTask):
def validation_docs(self):
if self.has_validation_docs():
return map(self._process_doc, self.dataset["val"])
def test_docs(self):
if self.has_test_docs():
return map(self._process_doc, self.dataset["test"])
def _format_subject(self, subject):
words = subject.split("_")
@@ -124,7 +124,7 @@ class CevalSubject(MultipleChoiceTask):
def fewshot_context(self, doc, num_fewshot, **kwargs):
subject = self.DATASET_NAME
description = f"以下是中国关于{SUBJECTS[subject]}的单项选择题,请选出其中的正确答案。"
kwargs["description"] = description
return super().fewshot_context(doc=doc, num_fewshot=num_fewshot, **kwargs)
@@ -140,9 +140,7 @@ class CevalSubject(MultipleChoiceTask):
"""
question = doc["question"].strip()
choices = "".join([f"{key}. {doc[key]}\n" for key in keys])
prompt = f"{question}\n{choices}答案:"
return prompt
@@ -150,7 +148,7 @@ class CevalSubject(MultipleChoiceTask):
return {
"query": format_example(doc, keys),
"choices": keys,
"gold": ord(doc["answer"]) - ord("A"),
}
def fewshot_examples(self, k, rnd):
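To make the prompt construction above concrete, here is a toy example (not a real C-Eval item) of what the `format_example` logic and the gold-index conversion produce:

```python
doc = {"question": "1 + 1 等于多少?", "A": "1", "B": "2", "C": "3", "D": "4", "answer": "B"}
keys = ["A", "B", "C", "D"]

choices = "".join([f"{key}. {doc[key]}\n" for key in keys])
prompt = f"{doc['question'].strip()}\n{choices}答案:"
gold = ord(doc["answer"]) - ord("A")  # letter answer -> 0-based index, here 1

print(prompt)
# 1 + 1 等于多少?
# A. 1
# B. 2
# C. 3
# D. 4
# 答案:
```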
......
@@ -109,16 +109,16 @@ SUBJECT_MAPPING = {
"chinese_driving_rule": "中国驾驶规则",
"chinese_food_culture": "中国饮食文化",
"chinese_foreign_policy": "中国外交政策",
"chinese_history": "中国历史",
"chinese_literature": "中国文学",
"chinese_teacher_qualification": "中国教师资格",
"clinical_knowledge": "临床知识",
"college_actuarial_science": "大学精算学",
"college_education": "大学教育学",
"college_engineering_hydrology": "大学工程水文学",
"college_law": "大学法律",
"college_mathematics": "大学数学",
"college_medical_statistics": "大学医学统计",
"college_medicine": "大学医学",
"computer_science": "计算机科学",
"computer_security": "计算机安全",
@@ -127,8 +127,8 @@ SUBJECT_MAPPING = {
"economics": "经济学",
"education": "教育学",
"electrical_engineering": "电气工程",
"elementary_chinese": "小学语文",
"elementary_commonsense": "小学常识",
"elementary_information_and_technology": "小学信息技术",
"elementary_mathematics": "初等数学",
"ethnology": "民族学",
@@ -159,12 +159,12 @@ SUBJECT_MAPPING = {
"professional_medicine": "专业医学",
"professional_psychology": "专业心理学",
"public_relations": "公共关系",
"security_study": "安全研究",
"sociology": "社会学",
"sports_science": "体育学",
"traditional_chinese_medicine": "中医中药",
"virology": "病毒学",
"world_history": "世界历史",
"world_religions": "世界宗教",
}
......
@@ -16,7 +16,7 @@ class CSATQA(MultipleChoiceTask):
def test_docs(self):
return map(self._process_doc, self.dataset["test"])
def _process_doc(self, doc):
instruction = f"""다음을 읽고 정답으로 알맞은 것을 고르시요.
### Context: {doc["context"]}
@@ -25,11 +25,17 @@ class CSATQA(MultipleChoiceTask):
(1) {doc['option#1']}\n(2) {doc["option#2"]}\n(3) {doc["option#3"]}\n(4) {doc['option#4']}\n(5) {doc['option#5']}
### Answer: 주어진 문제의 정답은"""
choices = [
doc["option#1"],
doc["option#2"],
doc["option#3"],
doc["option#4"],
doc["option#5"],
]
out_doc = {
"question": instruction,
"choices": ["(1)", "(2)", "(3)", "(4)", "(5)"],
"gold": int(doc["gold"]) - 1,
}
return out_doc
@@ -40,18 +46,23 @@ class CSATQA(MultipleChoiceTask):
class WR(CSATQA):
DATASET_NAME = "WR"
class GR(CSATQA):
DATASET_NAME = "GR"
class RCS(CSATQA):
DATASET_NAME = "RCS"
class RCSS(CSATQA):
DATASET_NAME = "RCSS"
class RCH(CSATQA):
DATASET_NAME = "RCH"
class LI(CSATQA):
DATASET_NAME = "LI"
@@ -16,7 +16,7 @@ class Haerae(MultipleChoiceTask):
def test_docs(self):
return map(self._process_doc, self.dataset["test"])
def _process_doc(self, doc):
choices = [doc["o1"], doc["o2"], doc["o3"], doc["o4"]]
if doc.get("o5") is not None:
@@ -24,7 +24,7 @@ class Haerae(MultipleChoiceTask):
out_doc = {
"query": doc["query"],
"choices": choices,
"gold": int(doc["gold"]) - 1,
}
return out_doc
......
"""
Latent Retrieval for Weakly Supervised Open Domain Question Answering
https://arxiv.org/pdf/1906.00300.pdf
Natural Questions: a Benchmark for Question Answering Research
https://storage.googleapis.com/pub-tools-public-publication-data/pdf/1f7b46b5378d757553d3e92ead36bda2e4254244.pdf
The NQ-Open task, introduced by Lee et. al. 2019, is an open-domain question
answering benchmark that is derived from Natural Questions. The goal is to predict
an English answer string for an input English question. All questions can be
answered using the contents of English Wikipedia.
Homepage: https://github.com/google-research-datasets/natural-questions/tree/master/nq_open
"""
import regex
import string
from lm_eval.base import Task, rf
from lm_eval.metrics import mean
_CITATION = """
@inproceedings{lee-etal-2019-latent,
title = "Latent Retrieval for Weakly Supervised Open Domain Question Answering",
author = "Lee, Kenton and
Chang, Ming-Wei and
Toutanova, Kristina",
booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics",
month = jul,
year = "2019",
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/P19-1612",
doi = "10.18653/v1/P19-1612",
pages = "6086--6096",
abstract = "Recent work on open domain question answering (QA) assumes strong supervision of the supporting evidence and/or assumes a blackbox information retrieval (IR) system to retrieve evidence candidates. We argue that both are suboptimal, since gold evidence is not always available, and QA is fundamentally different from IR. We show for the first time that it is possible to jointly learn the retriever and reader from question-answer string pairs and without any IR system. In this setting, evidence retrieval from all of Wikipedia is treated as a latent variable. Since this is impractical to learn from scratch, we pre-train the retriever with an Inverse Cloze Task. We evaluate on open versions of five QA datasets. On datasets where the questioner already knows the answer, a traditional IR system such as BM25 is sufficient. On datasets where a user is genuinely seeking an answer, we show that learned retrieval is crucial, outperforming BM25 by up to 19 points in exact match.",
}
"""
class NQOpen(Task):
VERSION = 0
DATASET_PATH = "nq_open"
DATASET_NAME = None
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return False
def training_docs(self):
return self.dataset["train"]
def validation_docs(self):
return self.dataset["validation"]
def test_docs(self):
raise NotImplementedError()
def doc_to_text(self, doc):
return f"Q: {doc['question']}\nA:"
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc["question"]
def doc_to_target(self, doc):
return " " + doc["answer"][0]
def construct_requests(self, doc, ctx):
"""Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
continuation = rf.greedy_until(ctx, {"until": ["\n", ".", ","]})
return continuation
def _normalize_answer(self, text):
# Lowercase and remove punctuation, strip whitespace
text = text.strip().lower().translate(str.maketrans("", "", string.punctuation))
# Remove articles, resulting in duplicate whitespace
text = regex.sub(r"\b(a|an|the)\b", " ", text)
# Remove duplicate whitespace
text = " ".join(text.split())
return text
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
continuation = self._normalize_answer(results[0])
answers = [self._normalize_answer(answer) for answer in doc["answer"]]
return {"em": float(continuation in answers)}
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
return {
"em": mean,
}
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
return {
"em": True,
}
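A small standalone illustration of the exact-match scoring used by `NQOpen.process_results`; the strings are made up, and the normalization mirrors `_normalize_answer` above:

```python
import regex
import string

def normalize_answer(text):
    # lowercase, strip punctuation, drop articles, collapse whitespace
    text = text.strip().lower().translate(str.maketrans("", "", string.punctuation))
    text = regex.sub(r"\b(a|an|the)\b", " ", text)
    return " ".join(text.split())

prediction = "The Eiffel Tower."
gold_answers = ["Eiffel Tower", "La tour Eiffel"]
em = float(normalize_answer(prediction) in [normalize_answer(a) for a in gold_answers])
print(em)  # 1.0
```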
@@ -33,34 +33,43 @@ _CITATION = """
class Pubmed_QA(Task):
VERSION = 0
-DATASET_PATH = "pubmed_qa"
-DATASET_NAME = "pqa_labeled"
+DATASET_PATH = "bigbio/pubmed_qa"
+DATASET_NAME = "pubmed_qa_labeled_fold0_source"
def has_training_docs(self):
-return False
+return True
def has_validation_docs(self):
-return False
+return True
def has_test_docs(self):
return True
+def training_docs(self):
+if self.has_training_docs():
+if self._training_docs is None:
+self._training_docs = self.dataset["train"]
+return self._training_docs
+def validation_docs(self):
+if self.has_validation_docs():
+return self.dataset["validation"]
def test_docs(self):
if self.has_test_docs():
-# HF is labelled as train but its really just for testing
-return self.dataset["train"]
+return self.dataset["test"]
def doc_to_text(self, doc):
-ctxs = "\n".join(doc["context"]["contexts"])
+ctxs = "\n".join(doc["CONTEXTS"])
return "Abstract: {}\nQuestion: {}\nAnswer:".format(
-ctxs, doc["question"], doc["final_decision"]
+ctxs, doc["QUESTION"], doc["final_decision"]
)
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
-return doc["question"] + " " + "\n".join(doc["context"]["contexts"])
+return doc["question"] + " " + "\n".join(doc["CONTEXTS"])
def doc_to_target(self, doc):
return " {}".format(doc["final_decision"])
......
@@ -42,7 +42,7 @@ import re
_CITATION = """
@inproceedings{shaham-etal-2022-scrolls,
title = "{SCROLLS}: Standardized {C}ompa{R}ison Over Long Language Sequences",
author = "Shaham, Uri and
Segal, Elad and
Ivgi, Maor and
Efrat, Avia and
@@ -72,9 +72,14 @@ def _download_metric():
import os
import shutil
from huggingface_hub import hf_hub_download
scrolls_metric_path = hf_hub_download(
repo_id="tau/scrolls", repo_type="dataset", filename="metrics/scrolls.py"
)
updated_scrolls_metric_path = (
os.path.dirname(scrolls_metric_path)
+ os.path.basename(scrolls_metric_path).replace(".", "_")
+ ".py"
)
shutil.copy(scrolls_metric_path, updated_scrolls_metric_path)
return updated_scrolls_metric_path
@@ -92,7 +97,7 @@ def _process_doc_prepended_question(doc):
"input": input,
"outputs": doc["outputs"],
"question": input[0:split],
"text": input[split + 2 :],
}
@@ -102,7 +107,9 @@ def _drop_duplicates_in_input(untokenized_dataset):
indices_to_keep = []
id_to_idx = {}
outputs = []
for i, (id_, output) in enumerate(
zip(untokenized_dataset["id"], untokenized_dataset["output"])
):
if id_ in id_to_idx:
outputs[id_to_idx[id_]].append(output)
continue
@@ -119,9 +126,11 @@ def _num_cpu_cores():
# https://stackoverflow.com/questions/1006289/how-to-find-out-the-number-of-cpus-using-python/55423170#55423170
try:
import psutil
return psutil.cpu_count(logical=False)
except ImportError:
import os
return len(os.sched_getaffinity(0))
@@ -135,7 +144,11 @@ class _SCROLLSTask(Task):
def __init__(self, no_metric=False):
super().__init__()
self.metric = (
load_metric(_download_metric(), config_name=self.DATASET_NAME)
if not no_metric
else None
)
def has_training_docs(self):
return True
@@ -176,7 +189,10 @@ class _SCROLLSTask(Task):
that are less than `max_tokens` when tokenized by each tokenizer
"""
tokenizers = [
AutoTokenizer.from_pretrained(tokenizer)
for tokenizer in self.PRUNE_TOKENIZERS
]
cache = {}
def _filter(sample):
@@ -210,18 +226,21 @@ class _SCROLLSTask(Task):
def _make_compute_metrics(self, value):
def compute_metrics(samples):
predictions, references = zip(*samples)  # unzip, if you will
computed = self.metric.compute(
predictions=predictions, references=references
)
return computed[value]
return compute_metrics
def aggregation(self):
return {
key: self._make_compute_metrics(value)
for key, value in self._scrolls_metrics().items()
}
class _SCROLLSMultipleChoiceTask(_SCROLLSTask):
def __init__(self):
super().__init__(no_metric=True)
@@ -229,18 +248,10 @@ class _SCROLLSMultipleChoiceTask(_SCROLLSTask):
return None
def aggregation(self):
return {"em": mean, "acc": mean, "acc_norm": mean}
def higher_is_better(self):
return {"em": True, "acc": True, "acc_norm": True}
def process_results(self, doc, results):
gold = doc["gold"]
@@ -264,22 +275,25 @@ class _SCROLLSMultipleChoiceTask(_SCROLLSTask):
class _SCROLLSSummaryTask(_SCROLLSTask):
def _process_doc(self, doc):
return [doc]
def _scrolls_metrics(self):
return {
"rouge1": "rouge/rouge1",
"rouge2": "rouge/rouge2",
"rougeL": "rouge/rougeL",
}
def process_results(self, doc, results):
return {
"rouge1": (results[0], doc["outputs"]),
"rouge2": (results[0], doc["outputs"]),
"rougeL": (results[0], doc["outputs"]),
}
def construct_requests(self, doc, ctx):
return [rf.greedy_until(ctx, {"until": ["\n"]})]
def doc_to_text(self, doc):
return f"{doc['input']}\n\nQuestion: What is a summary of the preceding text?\nAnswer:"
@@ -294,8 +308,12 @@ class Qasper(_SCROLLSTask):
def _process_doc(self, doc):
doc = _process_doc_prepended_question(doc)
doc["is_yes_no"] = reduce(
lambda prev, cur: prev
and squad_metrics.normalize_answer(cur) in ["yes", "no"],
doc["outputs"],
True,
)
return [doc]
def _scrolls_metrics(self):
@@ -308,9 +326,7 @@ class Qasper(_SCROLLSTask):
prediction = "Unanswerable"
else:
prediction = results[0]
return {"f1": (prediction, doc["outputs"])}
def construct_requests(self, doc, ctx):
if doc["is_yes_no"]:
@@ -318,7 +334,7 @@ class Qasper(_SCROLLSTask):
ll_no, _ = rf.loglikelihood(ctx, " no")
return [ll_yes, ll_no]
else:
return [rf.greedy_until(ctx, {"until": ["\n"]})]
class QuALITY(_SCROLLSMultipleChoiceTask):
@@ -340,8 +356,10 @@ class QuALITY(_SCROLLSMultipleChoiceTask):
choices_text = doc["text"][:split]
doc["text"] = doc["text"][split:].strip()
doc["choices"] = [
QuALITY._normalize_answer(choice)
for choice in re.split(QuALITY._multiple_choice_pattern, choices_text)[1:]
]
doc["gold"] = doc["choices"].index(QuALITY._normalize_answer(doc["outputs"][0]))
return [doc]
@@ -368,12 +386,10 @@ class NarrativeQA(_SCROLLSTask):
return self._process_doc(doc)[0]["text"]
def process_results(self, doc, results):
return {"f1": (results[0], doc["outputs"])}
def construct_requests(self, doc, ctx):
return [rf.greedy_until(ctx, {"until": ["\n"]})]
class ContractNLI(_SCROLLSMultipleChoiceTask):
@@ -439,5 +455,5 @@ def construct_tasks():
"scrolls_contractnli": ContractNLI,
"scrolls_govreport": GovReport,
"scrolls_summscreenfd": SummScreenFD,
"scrolls_qmsum": QMSum,
}
@@ -76,8 +76,16 @@ class TriviaQA(Task):
return continuation
def process_results(self, doc, results):
continuation = (
results[0]
.strip()
.lower()
.translate(str.maketrans("", "", string.punctuation))
)
list_of_candidates = [
alias.lower().translate(str.maketrans("", "", string.punctuation))
for alias in doc["answer"]["aliases"]
]
return {"em": float(continuation in list_of_candidates)}
def aggregation(self):
......
@@ -12,17 +12,27 @@ def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--model", required=True)
parser.add_argument("--model_args", default="")
parser.add_argument(
"--tasks", default=None, choices=utils.MultiChoice(tasks.ALL_TASKS)
)
parser.add_argument("--provide_description", action="store_true")
parser.add_argument("--num_fewshot", type=int, default=0)
parser.add_argument("--batch_size", type=str, default=None)
parser.add_argument(
"--max_batch_size",
type=int,
default=None,
help="Maximal batch size to try with --batch_size auto",
)
parser.add_argument("--device", type=str, default=None)
parser.add_argument("--output_path", default=None)
parser.add_argument(
"--limit",
type=float,
default=None,
help="Limit the number of examples per task. "
"If <1, limit is a percentage of the total number of examples.",
)
parser.add_argument("--data_sampling", type=float, default=None)
parser.add_argument("--no_cache", action="store_true")
parser.add_argument("--decontamination_ngrams_path", default=None)
@@ -77,7 +87,9 @@ def main():
print(dumped)
if args.output_path:
-os.makedirs(os.path.dirname(args.output_path), exist_ok=True)
+dirname = os.path.dirname(args.output_path)
+if dirname:
+os.makedirs(dirname, exist_ok=True)
with open(args.output_path, "w") as f:
f.write(dumped)
......
@@ -9,7 +9,12 @@ from lm_eval import tasks, utils
seq2seq_models = ["google/flan-t5-small"]
causal_models = [
"gpt2",
"facebook/opt-125m",
"EleutherAI/gpt-neo-125m",
"EleutherAI/pythia-160m",
]
model_names = seq2seq_models + causal_models
@@ -50,22 +55,41 @@ def eval_models(args, branch=None):
results = {}
for model in args.models:
model_type = (
"hf-causal-experimental"
if model in causal_models
else "hf-seq2seq"
if model in seq2seq_models
else args.model
)
model_args = f"pretrained={model},{args.model_args}"
# TODO: split_and_pad_windows in AutoSeq2SeqLM doesn"t exist, #527
tasks = (
args.tasks
if model in causal_models or model_type == "hf-causal-experimental"
else list(filter(lambda task: task not in perplexity_tasks, args.tasks))
)
# TODO: OOM with auto for seq2seq models, also can OOM with llama
batch_size = (
args.batch_size
if model in causal_models or model_type == "hf-causal-experimental"
else 64
if args.batch_size == "auto"
else args.batch_size
)
output_path = (
f"data/regression/{int(start_time)}-{branch}-{Path(model).name}.json"
)
command = (
f"python3 main.py --model {model_type} --model_args {model_args} --tasks {','.join(tasks)} "
f"--num_fewshot {args.num_fewshot}{'' if args.limit is None else f' --limit {args.limit}'} "
f"--batch_size {batch_size} --no_cache --output_path {output_path}"
)
print(
f"{'=' * 80}\nEvaluating {model} on {', '.join(tasks)} at {branch} with:\n\n{command}\n{'=' * 80}"
)
ret = os.system(command)
@@ -108,13 +132,25 @@ def format_diff(args, results1, results2, model, task):
def main():
args = parse_args()
args.branches = (
args.branches.split(",") if type(args.branches) == str else args.branches
)
args.models = args.models.split(",") if type(args.models) == str else args.models
args.tasks = (
tasks.ALL_TASKS
if args.tasks == "all_tasks"
else utils.pattern_match(
args.tasks.split(",") if type(args.tasks) == str else args.tasks,
tasks.ALL_TASKS,
)
)
global initial_branch
initial_branch = (
subprocess.check_output("git branch --show-current", shell=True)
.decode("ascii")
.strip()
)
# TODO: implement proper timing for each task
# TODO: reduce IO by sharing tasks between models?
@@ -132,10 +168,16 @@ def main():
print(f"|task|{'|'.join(map(lambda model: Path(model).name, args.models))}|")
print(f"|--|{'--|' * len(args.models)}")
for task in args.tasks:
print(
f"|{task} ({initial_branch})|{'|'.join(map(lambda model: format_value(args, results, model, task), args.models))}|"
)
for branch, branch_results, branch_runtime in runs:
print(
f"|{task} ({branch})|{'|'.join(map(lambda model: format_value(args, branch_results, model, task), args.models))}|"
)
print(
f"|{task} (diff)|{'|'.join(map(lambda model: format_diff(args, results, branch_results, model, task), args.models))}|"
)
print("")
print("|branch|runtime|%|")
......
@@ -12,10 +12,8 @@ setuptools.setup(
long_description=long_description,
long_description_content_type="text/markdown",
url="https://github.com/EleutherAI/lm-evaluation-harness",
-packages=setuptools.find_packages(),
+packages=setuptools.find_packages(exclude=["scripts.*", "scripts"]),
package_data={"lm_eval": ["**/*.json"]},
include_package_data=True,
classifiers=[
"Development Status :: 3 - Alpha",
......