Unverified commit 2c18e367, authored by Lintang Sutawika, committed by GitHub

Merge pull request #876 from jonabur/output_bugfix

fix bug with output path in CWD
parents 00209e10 93cbffa5
@@ -25,6 +25,3 @@ WORKDIR /lm-evaluation-harness
RUN pip install --no-cache-dir -e .
### Run bash
CMD ["/bin/bash"]
@@ -309,7 +309,9 @@ class BaseLM(LM):
if override_bs is not None
else 0,
fn=_batch_scheduler
-if self.batch_size == "auto" and n_reordered_requests > 0 and not override_bs
+if self.batch_size == "auto"
+and n_reordered_requests > 0
+and not override_bs
else None,
):
inps = []
@@ -375,7 +377,9 @@ class BaseLM(LM):
# Slice to original seq length
contlen = len(cont_toks)
-inplen = inplen + (logits.shape[0] - padding_length)  # if "virtual tokens" (from prompt tuning) are added, inplen is larger
+inplen = inplen + (
+logits.shape[0] - padding_length
+)  # if "virtual tokens" (from prompt tuning) are added, inplen is larger
logits = logits[inplen - contlen : inplen].unsqueeze(
0
) # [1, seq, vocab]
......
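For context, a worked example of the slice above with made-up numbers (not taken from the diff): suppose the padded input is 10 tokens, prompt tuning prepends 2 virtual tokens, the raw input was 6 tokens, and the continuation is 3 tokens.

    # Sketch of the slicing arithmetic, hypothetical values only.
    padding_length, logits_len, inplen, contlen = 10, 12, 6, 3
    inplen = inplen + (logits_len - padding_length)  # 6 + 2 = 8
    start, end = inplen - contlen, inplen            # slice logits[5:8] scores the continuation
    assert (start, end) == (5, 8)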
@@ -74,7 +74,12 @@ def simple_evaluate(
if model_args is None:
model_args = ""
lm = lm_eval.models.get_model(model).create_from_arg_string(
-model_args, {"batch_size": batch_size, "max_batch_size": max_batch_size, "device": device}
+model_args,
+{
+"batch_size": batch_size,
+"max_batch_size": max_batch_size,
+"device": device,
+},
)
elif isinstance(model, transformers.PreTrainedModel):
lm = lm_eval.models.get_model("hf-causal")(
@@ -125,7 +130,9 @@ def simple_evaluate(
"model_args": model_args,
"num_fewshot": num_fewshot,
"batch_size": batch_size,
-"batch_sizes": list(lm.batch_sizes.values()) if hasattr(lm, "batch_sizes") else [],
+"batch_sizes": list(lm.batch_sizes.values())
+if hasattr(lm, "batch_sizes")
+else [],
"device": device,
"no_cache": no_cache,
"limit": limit,
......
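For context, `create_from_arg_string` consumes a comma-separated key=value string such as `pretrained=gpt2,dtype=float16` and merges in the extra settings dict shown above. A minimal sketch of that parsing, not the harness's exact implementation:

    def parse_arg_string(arg_string: str, extra: dict) -> dict:
        # "pretrained=gpt2,dtype=float16" -> {"pretrained": "gpt2", "dtype": "float16"}
        kwargs = dict(pair.split("=", 1) for pair in arg_string.split(",") if pair)
        kwargs.update(extra)  # per-call settings: batch_size, max_batch_size, device
        return kwargs

    print(parse_arg_string("pretrained=gpt2,dtype=float16", {"device": "cuda"}))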
@@ -4,9 +4,7 @@ from typing import Optional, Union
from lm_eval.base import BaseLM
-def _get_dtype(
-dtype: Union[str, torch.dtype]
-) -> torch.dtype:
+def _get_dtype(dtype: Union[str, torch.dtype]) -> torch.dtype:
"""Converts `dtype` from `str` to torch.dtype when possible. Does not use an instantiated HF AutoConfig"""
if isinstance(dtype, str) and dtype != "auto":
# Convert `str` args torch dtype: `float16` -> `torch.float16`
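The conversion works because torch exposes dtypes as module attributes, so getattr(torch, "float16") is torch.float16. A minimal self-contained sketch of the function above:

    import torch

    def get_dtype(dtype):
        # "float16" -> torch.float16; "auto" and torch.dtype values pass through
        if isinstance(dtype, str) and dtype != "auto":
            return getattr(torch, dtype)
        return dtype

    assert get_dtype("float16") is torch.float16
    assert get_dtype(torch.bfloat16) is torch.bfloat16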
@@ -33,11 +31,10 @@ class HFLM(BaseLM):
max_length=None,
load_in_8bit: Optional[bool] = False,
trust_remote_code: Optional[bool] = False,
-dtype: Optional[Union[str, torch.dtype]]="auto",
+dtype: Optional[Union[str, torch.dtype]] = "auto",
):
super().__init__()
# Initialize model
if isinstance(pretrained, transformers.PreTrainedModel):
self.model = pretrained
@@ -45,12 +42,8 @@ class HFLM(BaseLM):
if tokenizer:
assert isinstance(
-tokenizer,
-transformers.PreTrainedTokenizer
-) or isinstance(
-tokenizer,
-transformers.PreTrainedTokenizerFast
-)
+tokenizer, transformers.PreTrainedTokenizer
+) or isinstance(tokenizer, transformers.PreTrainedTokenizerFast)
self.tokenizer = tokenizer
else:
# Get tokenizer
@@ -66,7 +59,8 @@ class HFLM(BaseLM):
# Initialize device
assert isinstance(device, str)
device_list = set(
-["cuda", "cpu"] + [f"cuda:{i}" for i in range(torch.cuda.device_count())]
+["cuda", "cpu"]
++ [f"cuda:{i}" for i in range(torch.cuda.device_count())]
)
if device and device in device_list:
self._device = torch.device(device)
@@ -97,7 +91,9 @@ class HFLM(BaseLM):
)
else:
-raise TypeError('Parameter pretrained should be of type str or transformers.PreTrainedModel')
+raise TypeError(
+"Parameter pretrained should be of type str or transformers.PreTrainedModel"
+)
self.model.eval()
@@ -136,7 +132,6 @@ class HFLM(BaseLM):
return self.tokenizer.model_max_length
return self._DEFAULT_MAX_LENGTH
@property
def max_gen_toks(self):
return 256
@@ -171,8 +166,10 @@ class HFLM(BaseLM):
def _model_generate(self, context, max_length, eos_token_id):
generation_kwargs = {"do_sample": False, "max_length": max_length}
if eos_token_id is not None:
-generation_kwargs['eos_token_id'] = eos_token_id
-generation_kwargs['pad_token_id'] = eos_token_id  # setting eos_token_id as pad token
+generation_kwargs["eos_token_id"] = eos_token_id
+generation_kwargs[
+"pad_token_id"
+] = eos_token_id  # setting eos_token_id as pad token
return self.model.generate(context, **generation_kwargs)
......
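Reusing the eos token as pad_token_id is the standard workaround for decoder-only models such as GPT-2 that define no pad token; it also silences the warning transformers prints in that case. A minimal sketch under those assumptions:

    from transformers import AutoModelForCausalLM, AutoTokenizer

    tok = AutoTokenizer.from_pretrained("gpt2")  # gpt2 has an eos token but no pad token
    model = AutoModelForCausalLM.from_pretrained("gpt2")
    ids = tok("The capital of France is", return_tensors="pt").input_ids
    out = model.generate(
        ids,
        do_sample=False,
        max_length=16,
        eos_token_id=tok.eos_token_id,
        pad_token_id=tok.eos_token_id,  # setting eos_token_id as pad token
    )
    print(tok.decode(out[0]))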
@@ -266,7 +266,9 @@ class HuggingFaceAutoLM(BaseLM):
try:
self.model.to(self._device)
except:
-print("Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes`. If the desired GPU is being used, this message is safe to ignore.")
+print(
+"Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes`. If the desired GPU is being used, this message is safe to ignore."
+)
def _create_auto_model(
self,
@@ -292,7 +294,9 @@ class HuggingFaceAutoLM(BaseLM):
"""Returns a pre-trained pytorch model from a pre-trained model configuration."""
if not quantized:
if load_in_4bit:
-assert transformers.__version__ >= "4.30.0", "load_in_4bit requires transformers >= 4.30.0"
+assert (
+transformers.__version__ >= "4.30.0"
+), "load_in_4bit requires transformers >= 4.30.0"
model_kwargs = {}
if transformers.__version__ >= "4.30.0":
model_kwargs["load_in_4bit"] = load_in_4bit
@@ -300,9 +304,13 @@ class HuggingFaceAutoLM(BaseLM):
if bnb_4bit_quant_type:
model_kwargs["bnb_4bit_quant_type"] = bnb_4bit_quant_type
if bnb_4bit_compute_dtype:
-model_kwargs["bnb_4bit_compute_dtype"] = _get_dtype(bnb_4bit_compute_dtype)
+model_kwargs["bnb_4bit_compute_dtype"] = _get_dtype(
+bnb_4bit_compute_dtype
+)
if bnb_4bit_use_double_quant:
-model_kwargs["bnb_4bit_use_double_quant"] = bnb_4bit_use_double_quant
+model_kwargs[
+"bnb_4bit_use_double_quant"
+] = bnb_4bit_use_double_quant
model = self.AUTO_MODEL_CLASS.from_pretrained(
pretrained,
revision=revision + ("/" + subfolder if subfolder is not None else ""),
@@ -317,13 +325,16 @@
)
else:
from auto_gptq import AutoGPTQForCausalLM
model = AutoGPTQForCausalLM.from_quantized(
pretrained,
model_basename=None if quantized == True else Path(quantized).stem,
device_map=device_map,
max_memory=max_memory,
trust_remote_code=trust_remote_code,
-use_safetensors=True if quantized == True else quantized.endswith('.safetensors'),
+use_safetensors=True
+if quantized == True
+else quantized.endswith(".safetensors"),
use_triton=gptq_use_triton,
warmup_triton=gptq_use_triton,
inject_fused_attention=inject_fused_attention,
......
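For context, the four bnb_4bit keyword arguments collected above correspond to fields of BitsAndBytesConfig in transformers >= 4.30, so passing an explicit config is an equivalent formulation. A hedged sketch (assumes bitsandbytes is installed and a CUDA device is available; the checkpoint name is only an example):

    import torch
    from transformers import AutoModelForCausalLM, BitsAndBytesConfig

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )
    model = AutoModelForCausalLM.from_pretrained(
        "EleutherAI/pythia-160m",
        quantization_config=bnb_config,
        device_map="auto",
    )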
@@ -330,11 +330,11 @@ TASK_REGISTRY = {
"csatqa_rch": csatqa.RCH,
"csatqa_li": csatqa.LI,
"haerae_hi": haerae.HI,
-"haerae_kgk":haerae.KGK,
-"haerae_lw":haerae.LW,
-"haerae_rc":haerae.RC,
-"haerae_rw":haerae.RW,
-"haerae_sn":haerae.SN,
+"haerae_kgk": haerae.KGK,
+"haerae_lw": haerae.LW,
+"haerae_rc": haerae.RC,
+"haerae_rw": haerae.RW,
+"haerae_sn": haerae.SN,
-# Requires manual download
+# Requires manual download of data.
# "storycloze_2016": storycloze.StoryCloze2016,
......
@@ -16,6 +16,7 @@ _CITATION = """
}
"""
class Babi(Task):
VERSION = 0
DATASET_PATH = "Muennighoff/babi"
@@ -43,9 +44,7 @@ class Babi(Task):
return self.dataset["test"]
def doc_to_text(self, doc):
-return (
-doc['passage'] + doc['question']
-)
+return doc["passage"] + doc["question"]
def should_decontaminate(self):
return False # TODO Necessary?
@@ -54,7 +53,7 @@
return f"Passage: {doc['passage']}\nQuestion: {doc['question']}\nAnswer:"
def doc_to_target(self, doc):
-return " " + doc['answer']
+return " " + doc["answer"]
def construct_requests(self, doc, ctx):
"""Uses RequestFactory to construct Requests and returns an iterable of
......
@@ -21,58 +21,58 @@ _CITATION = """
SUBJECTS = {
-"computer_network":"计算机网络",
-"operating_system":"操作系统",
-"computer_architecture":"计算机组成",
-"college_programming":"大学编程",
-"college_physics":"大学物理",
-"college_chemistry":"大学化学",
-"advanced_mathematics":"高等数学",
-"probability_and_statistics":"概率统计",
-"discrete_mathematics":"离散数学",
-"electrical_engineer":"注册电气工程师",
-"metrology_engineer":"注册计量师",
-"high_school_mathematics":"高中数学",
-"high_school_physics":"高中物理",
-"high_school_chemistry":"高中化学",
-"high_school_biology":"高中生物",
-"middle_school_mathematics":"初中数学",
-"middle_school_biology":"初中生物",
-"middle_school_physics":"初中物理",
-"middle_school_chemistry":"初中化学",
-"veterinary_medicine":"兽医学",
-"college_economics":"大学经济学",
-"business_administration":"工商管理",
-"marxism":"马克思主义基本原理",
-"mao_zedong_thought":"毛泽东思想和中国特色社会主义理论体系概论",
-"education_science":"教育学",
-"teacher_qualification":"教师资格",
-"high_school_politics":"高中政治",
-"high_school_geography":"高中地理",
-"middle_school_politics":"初中政治",
-"middle_school_geography":"初中地理",
-"modern_chinese_history":"近代史纲要",
-"ideological_and_moral_cultivation":"思想道德修养与法律基础",
-"logic":"逻辑学",
-"law":"法学",
-"chinese_language_and_literature":"中国语言文学",
-"art_studies":"艺术学",
-"professional_tour_guide":"导游资格",
-"legal_professional":"法律职业资格",
-"high_school_chinese":"高中语文",
-"high_school_history":"高中历史",
-"middle_school_history":"初中历史",
-"civil_servant":"公务员",
-"sports_science":"体育学",
-"plant_protection":"植物保护",
-"basic_medicine":"基础医学",
-"clinical_medicine":"临床医学",
-"urban_and_rural_planner":"注册城乡规划师",
-"accountant":"注册会计师",
-"fire_engineer":"注册消防工程师",
-"environmental_impact_assessment_engineer":"环境影响评价工程师",
-"tax_accountant":"税务师",
-"physician":"医师资格"
+"computer_network": "计算机网络",
+"operating_system": "操作系统",
+"computer_architecture": "计算机组成",
+"college_programming": "大学编程",
+"college_physics": "大学物理",
+"college_chemistry": "大学化学",
+"advanced_mathematics": "高等数学",
+"probability_and_statistics": "概率统计",
+"discrete_mathematics": "离散数学",
+"electrical_engineer": "注册电气工程师",
+"metrology_engineer": "注册计量师",
+"high_school_mathematics": "高中数学",
+"high_school_physics": "高中物理",
+"high_school_chemistry": "高中化学",
+"high_school_biology": "高中生物",
+"middle_school_mathematics": "初中数学",
+"middle_school_biology": "初中生物",
+"middle_school_physics": "初中物理",
+"middle_school_chemistry": "初中化学",
+"veterinary_medicine": "兽医学",
+"college_economics": "大学经济学",
+"business_administration": "工商管理",
+"marxism": "马克思主义基本原理",
+"mao_zedong_thought": "毛泽东思想和中国特色社会主义理论体系概论",
+"education_science": "教育学",
+"teacher_qualification": "教师资格",
+"high_school_politics": "高中政治",
+"high_school_geography": "高中地理",
+"middle_school_politics": "初中政治",
+"middle_school_geography": "初中地理",
+"modern_chinese_history": "近代史纲要",
+"ideological_and_moral_cultivation": "思想道德修养与法律基础",
+"logic": "逻辑学",
+"law": "法学",
+"chinese_language_and_literature": "中国语言文学",
+"art_studies": "艺术学",
+"professional_tour_guide": "导游资格",
+"legal_professional": "法律职业资格",
+"high_school_chinese": "高中语文",
+"high_school_history": "高中历史",
+"middle_school_history": "初中历史",
+"civil_servant": "公务员",
+"sports_science": "体育学",
+"plant_protection": "植物保护",
+"basic_medicine": "基础医学",
+"clinical_medicine": "临床医学",
+"urban_and_rural_planner": "注册城乡规划师",
+"accountant": "注册会计师",
+"fire_engineer": "注册消防工程师",
+"environmental_impact_assessment_engineer": "环境影响评价工程师",
+"tax_accountant": "税务师",
+"physician": "医师资格",
}
@@ -112,11 +112,11 @@ class CevalSubject(MultipleChoiceTask):
def validation_docs(self):
if self.has_validation_docs():
-return map(self._process_doc,self.dataset["val"])
+return map(self._process_doc, self.dataset["val"])
def test_docs(self):
if self.has_test_docs():
-return map(self._process_doc,self.dataset["test"])
+return map(self._process_doc, self.dataset["test"])
def _format_subject(self, subject):
words = subject.split("_")
@@ -124,7 +124,7 @@ class CevalSubject(MultipleChoiceTask):
def fewshot_context(self, doc, num_fewshot, **kwargs):
subject = self.DATASET_NAME
-description= f"以下是中国关于{SUBJECTS[subject]}的单项选择题,请选出其中的正确答案。"
+description = f"以下是中国关于{SUBJECTS[subject]}的单项选择题,请选出其中的正确答案。"
kwargs["description"] = description
return super().fewshot_context(doc=doc, num_fewshot=num_fewshot, **kwargs)
@@ -140,9 +140,7 @@ class CevalSubject(MultipleChoiceTask):
"""
question = doc["question"].strip()
-choices = "".join(
-[f'{key}. {doc[key]}\n' for key in keys]
-)
+choices = "".join([f"{key}. {doc[key]}\n" for key in keys])
prompt = f"{question}\n{choices}答案:"
return prompt
@@ -150,7 +148,7 @@ class CevalSubject(MultipleChoiceTask):
return {
"query": format_example(doc, keys),
"choices": keys,
-"gold": ord(doc["answer"])-ord("A"),
+"gold": ord(doc["answer"]) - ord("A"),
}
def fewshot_examples(self, k, rnd):
......
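The ord(...) - ord("A") idiom used in these task files maps a letter answer to a 0-based index into the choices list:

    assert ord("A") - ord("A") == 0
    assert ord("C") - ord("A") == 2  # answer "C" selects choices[2]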
@@ -32,16 +32,16 @@ SUBJECTS = {
"chinese_driving_rule": "中国驾驶规则",
"chinese_food_culture": "中国饮食文化",
"chinese_foreign_policy": "中国外交政策",
-"chinese_history":"中国历史",
+"chinese_history": "中国历史",
"chinese_literature": "中国文学",
"chinese_teacher_qualification": "中国教师资格",
"clinical_knowledge": "临床知识",
-"college_actuarial_science":"大学精算学",
-"college_education":"大学教育学",
+"college_actuarial_science": "大学精算学",
+"college_education": "大学教育学",
"college_engineering_hydrology": "大学工程水文学",
"college_law": "大学法律",
"college_mathematics": "大学数学",
-"college_medical_statistics":"大学医学统计",
+"college_medical_statistics": "大学医学统计",
"college_medicine": "大学医学",
"computer_science": "计算机科学",
"computer_security": "计算机安全",
@@ -50,8 +50,8 @@ SUBJECTS = {
"economics": "经济学",
"education": "教育学",
"electrical_engineering": "电气工程",
-"elementary_chinese":"小学语文",
-"elementary_commonsense":"小学常识",
+"elementary_chinese": "小学语文",
+"elementary_commonsense": "小学常识",
"elementary_information_and_technology": "小学信息技术",
"elementary_mathematics": "初等数学",
"ethnology": "民族学",
@@ -82,12 +82,12 @@ SUBJECTS = {
"professional_medicine": "专业医学",
"professional_psychology": "专业心理学",
"public_relations": "公共关系",
-"security_study":"安全研究",
+"security_study": "安全研究",
"sociology": "社会学",
"sports_science": "体育学",
"traditional_chinese_medicine": "中医中药",
"virology": "病毒学",
-"world_history":"世界历史",
+"world_history": "世界历史",
"world_religions": "世界宗教",
}
@@ -128,11 +128,11 @@ class CmmluSubject(MultipleChoiceTask):
def validation_docs(self):
if self.has_validation_docs():
-return map(self._process_doc,self.dataset["dev"])
+return map(self._process_doc, self.dataset["dev"])
def test_docs(self):
if self.has_test_docs():
-return map(self._process_doc,self.dataset["test"])
+return map(self._process_doc, self.dataset["test"])
def _format_subject(self, subject):
words = subject.split("_")
@@ -140,7 +140,7 @@ class CmmluSubject(MultipleChoiceTask):
def fewshot_context(self, doc, num_fewshot, **kwargs):
subject = self.DATASET_NAME
-description= f"以下是关于{SUBJECTS[subject]}的单项选择题,请直接给出正确答案的选项。"
+description = f"以下是关于{SUBJECTS[subject]}的单项选择题,请直接给出正确答案的选项。"
kwargs["description"] = description
return super().fewshot_context(doc=doc, num_fewshot=num_fewshot, **kwargs)
@@ -156,9 +156,7 @@ class CmmluSubject(MultipleChoiceTask):
"""
question = doc["Question"].strip()
-choices = "".join(
-[f'{key}. {doc[key]}\n' for key in keys]
-)
+choices = "".join([f"{key}. {doc[key]}\n" for key in keys])
prompt = f"{question}\n{choices}答案:"
return prompt
@@ -166,7 +164,7 @@ class CmmluSubject(MultipleChoiceTask):
return {
"query": format_example(doc, keys),
"choices": keys,
-"gold": ord(doc["Answer"])-ord("A"),
+"gold": ord(doc["Answer"]) - ord("A"),
}
def fewshot_examples(self, k, rnd):
......
@@ -25,11 +25,17 @@ class CSATQA(MultipleChoiceTask):
(1) {doc['option#1']}\n(2) {doc["option#2"]}\n(3) {doc["option#3"]}\n(4) {doc['option#4']}\n(5) {doc['option#5']}
### Answer: 주어진 문제의 정답은"""
-choices = [doc["option#1"], doc["option#2"], doc["option#3"], doc["option#4"], doc["option#5"]]
+choices = [
+doc["option#1"],
+doc["option#2"],
+doc["option#3"],
+doc["option#4"],
+doc["option#5"],
+]
out_doc = {
"question": instruction,
-"choices": ["(1)", "(2)","(3)","(4)","(5)"],
-"gold": int(doc['gold'])-1,
+"choices": ["(1)", "(2)", "(3)", "(4)", "(5)"],
+"gold": int(doc["gold"]) - 1,
}
return out_doc
@@ -41,17 +47,22 @@
class WR(CSATQA):
DATASET_NAME = "WR"
class GR(CSATQA):
DATASET_NAME = "GR"
class RCS(CSATQA):
DATASET_NAME = "RCS"
class RCSS(CSATQA):
DATASET_NAME = "RCSS"
class RCH(CSATQA):
DATASET_NAME = "RCH"
class LI(CSATQA):
DATASET_NAME = "LI"
@@ -24,7 +24,7 @@ class Haerae(MultipleChoiceTask):
out_doc = {
"query": doc["query"],
"choices": choices,
-"gold": int(doc['gold'])-1,
+"gold": int(doc["gold"]) - 1,
}
return out_doc
......
@@ -86,10 +86,10 @@ class NQOpen(Task):
def _normalize_answer(self, text):
# Lowercase and remove punctuation, strip whitespace
-text = text.strip().lower().translate(str.maketrans('', '', string.punctuation))
+text = text.strip().lower().translate(str.maketrans("", "", string.punctuation))
# Remove articles, resulting in duplicate whitespace
-text = regex.sub(r'\b(a|an|the)\b', ' ', text)
+text = regex.sub(r"\b(a|an|the)\b", " ", text)
# Remove duplicate whitespace
text = " ".join(text.split())
@@ -109,9 +109,7 @@ class NQOpen(Task):
continuation = self._normalize_answer(results[0])
answers = [self._normalize_answer(answer) for answer in doc["answer"]]
-return {
-"em": float(continuation in answers)
-}
+return {"em": float(continuation in answers)}
def aggregation(self):
"""
......
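The normalization above is the usual open-domain QA recipe: lowercase, strip punctuation, drop articles, collapse whitespace. A self-contained sketch using the stdlib re module in place of the third-party regex package:

    import re
    import string

    def normalize_answer(text: str) -> str:
        text = text.strip().lower().translate(str.maketrans("", "", string.punctuation))
        text = re.sub(r"\b(a|an|the)\b", " ", text)  # drop articles
        return " ".join(text.split())                # collapse whitespace

    assert normalize_answer("The Eiffel Tower!") == "eiffel tower"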
@@ -72,9 +72,14 @@ def _download_metric():
import os
import shutil
from huggingface_hub import hf_hub_download
-scrolls_metric_path = hf_hub_download(repo_id="tau/scrolls", repo_type="dataset", filename="metrics/scrolls.py")
+scrolls_metric_path = hf_hub_download(
+repo_id="tau/scrolls", repo_type="dataset", filename="metrics/scrolls.py"
+)
updated_scrolls_metric_path = (
-os.path.dirname(scrolls_metric_path) + os.path.basename(scrolls_metric_path).replace(".", "_") + ".py"
+os.path.dirname(scrolls_metric_path)
++ os.path.basename(scrolls_metric_path).replace(".", "_")
++ ".py"
)
shutil.copy(scrolls_metric_path, updated_scrolls_metric_path)
return updated_scrolls_metric_path
@@ -92,7 +97,7 @@ def _process_doc_prepended_question(doc):
"input": input,
"outputs": doc["outputs"],
"question": input[0:split],
-"text": input[split + 2:]
+"text": input[split + 2 :],
}
@@ -102,7 +107,9 @@ def _drop_duplicates_in_input(untokenized_dataset):
indices_to_keep = []
id_to_idx = {}
outputs = []
-for i, (id_, output) in enumerate(zip(untokenized_dataset["id"], untokenized_dataset["output"])):
+for i, (id_, output) in enumerate(
+zip(untokenized_dataset["id"], untokenized_dataset["output"])
+):
if id_ in id_to_idx:
outputs[id_to_idx[id_]].append(output)
continue
@@ -119,9 +126,11 @@ def _num_cpu_cores():
# https://stackoverflow.com/questions/1006289/how-to-find-out-the-number-of-cpus-using-python/55423170#55423170
try:
import psutil
return psutil.cpu_count(logical=False)
except ImportError:
import os
return len(os.sched_getaffinity(0))
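Note that the ImportError fallback relies on os.sched_getaffinity, which exists on Linux but not on macOS or Windows. A more portable variant, as a sketch:

    import os

    def num_cpu_cores() -> int:
        try:
            import psutil
            return psutil.cpu_count(logical=False)
        except ImportError:
            if hasattr(os, "sched_getaffinity"):  # Linux-only API
                return len(os.sched_getaffinity(0))
            return os.cpu_count() or 1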
@@ -135,7 +144,11 @@ class _SCROLLSTask(Task):
def __init__(self, no_metric=False):
super().__init__()
-self.metric = load_metric(_download_metric(), config_name=self.DATASET_NAME) if not no_metric else None
+self.metric = (
+load_metric(_download_metric(), config_name=self.DATASET_NAME)
+if not no_metric
+else None
+)
def has_training_docs(self):
return True
@@ -176,7 +189,10 @@ class _SCROLLSTask(Task):
that are less than `max_tokens` when tokenized by each tokenizer
"""
-tokenizers = [AutoTokenizer.from_pretrained(tokenizer) for tokenizer in self.PRUNE_TOKENIZERS]
+tokenizers = [
+AutoTokenizer.from_pretrained(tokenizer)
+for tokenizer in self.PRUNE_TOKENIZERS
+]
cache = {}
def _filter(sample):
@@ -210,18 +226,21 @@ class _SCROLLSTask(Task):
def _make_compute_metrics(self, value):
def compute_metrics(samples):
predictions, references = zip(*samples)  # unzip, if you will
-computed = self.metric.compute(predictions=predictions, references=references)
+computed = self.metric.compute(
+predictions=predictions, references=references
+)
return computed[value]
return compute_metrics
def aggregation(self):
return {
-key: self._make_compute_metrics(value) for key, value in self._scrolls_metrics().items()
+key: self._make_compute_metrics(value)
+for key, value in self._scrolls_metrics().items()
}
class _SCROLLSMultipleChoiceTask(_SCROLLSTask):
def __init__(self):
super().__init__(no_metric=True)
@@ -229,18 +248,10 @@ class _SCROLLSMultipleChoiceTask(_SCROLLSTask):
return None
def aggregation(self):
-return {
-"em": mean,
-"acc": mean,
-"acc_norm": mean
-}
+return {"em": mean, "acc": mean, "acc_norm": mean}
def higher_is_better(self):
-return {
-"em": True,
-"acc": True,
-"acc_norm": True
-}
+return {"em": True, "acc": True, "acc_norm": True}
def process_results(self, doc, results):
gold = doc["gold"]
@@ -264,22 +275,25 @@ class _SCROLLSMultipleChoiceTask(_SCROLLSTask):
class _SCROLLSSummaryTask(_SCROLLSTask):
def _process_doc(self, doc):
return [doc]
def _scrolls_metrics(self):
-return {"rouge1": "rouge/rouge1", "rouge2": "rouge/rouge2", "rougeL": "rouge/rougeL"}
+return {
+"rouge1": "rouge/rouge1",
+"rouge2": "rouge/rouge2",
+"rougeL": "rouge/rougeL",
+}
def process_results(self, doc, results):
return {
"rouge1": (results[0], doc["outputs"]),
"rouge2": (results[0], doc["outputs"]),
-"rougeL": (results[0], doc["outputs"])
+"rougeL": (results[0], doc["outputs"]),
}
def construct_requests(self, doc, ctx):
-return [rf.greedy_until(ctx, {'until': ["\n"]})]
+return [rf.greedy_until(ctx, {"until": ["\n"]})]
def doc_to_text(self, doc):
return f"{doc['input']}\n\nQuestion: What is a summary of the preceding text?\nAnswer:"
@@ -294,8 +308,12 @@ class Qasper(_SCROLLSTask):
def _process_doc(self, doc):
doc = _process_doc_prepended_question(doc)
-doc["is_yes_no"] = reduce(lambda prev, cur: prev and squad_metrics.normalize_answer(cur)
-in ["yes", "no"], doc["outputs"], True)
+doc["is_yes_no"] = reduce(
+lambda prev, cur: prev
+and squad_metrics.normalize_answer(cur) in ["yes", "no"],
+doc["outputs"],
+True,
+)
return [doc]
def _scrolls_metrics(self):
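The reduce above asks whether every gold answer normalizes to "yes" or "no". An equivalent, arguably clearer formulation with all(), shown as a sketch where normalize stands in for squad_metrics.normalize_answer:

    def is_yes_no(outputs, normalize):
        # True iff every reference answer normalizes to "yes" or "no"
        return all(normalize(o) in ("yes", "no") for o in outputs)

    assert is_yes_no(["Yes.", "yes"], lambda s: s.strip(". ").lower())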
@@ -308,9 +326,7 @@ class Qasper(_SCROLLSTask):
prediction = "Unanswerable"
else:
prediction = results[0]
-return {
-"f1": (prediction, doc["outputs"])
-}
+return {"f1": (prediction, doc["outputs"])}
def construct_requests(self, doc, ctx):
if doc["is_yes_no"]:
@@ -318,7 +334,7 @@
ll_no, _ = rf.loglikelihood(ctx, " no")
return [ll_yes, ll_no]
else:
-return [rf.greedy_until(ctx, {'until': ["\n"]})]
+return [rf.greedy_until(ctx, {"until": ["\n"]})]
class QuALITY(_SCROLLSMultipleChoiceTask):
@@ -340,8 +356,10 @@ class QuALITY(_SCROLLSMultipleChoiceTask):
choices_text = doc["text"][:split]
doc["text"] = doc["text"][split:].strip()
-doc["choices"] = [QuALITY._normalize_answer(choice) for choice in re.split(
-QuALITY._multiple_choice_pattern, choices_text)[1:]]
+doc["choices"] = [
+QuALITY._normalize_answer(choice)
+for choice in re.split(QuALITY._multiple_choice_pattern, choices_text)[1:]
+]
doc["gold"] = doc["choices"].index(QuALITY._normalize_answer(doc["outputs"][0]))
return [doc]
@@ -368,12 +386,10 @@ class NarrativeQA(_SCROLLSTask):
return self._process_doc(doc)[0]["text"]
def process_results(self, doc, results):
-return {
-"f1": (results[0], doc["outputs"])
-}
+return {"f1": (results[0], doc["outputs"])}
def construct_requests(self, doc, ctx):
-return [rf.greedy_until(ctx, {'until': ["\n"]})]
+return [rf.greedy_until(ctx, {"until": ["\n"]})]
class ContractNLI(_SCROLLSMultipleChoiceTask):
@@ -439,5 +455,5 @@ def construct_tasks():
"scrolls_contractnli": ContractNLI,
"scrolls_govreport": GovReport,
"scrolls_summscreenfd": SummScreenFD,
-"scrolls_qmsum": QMSum
+"scrolls_qmsum": QMSum,
}
@@ -76,8 +76,16 @@ class TriviaQA(Task):
return continuation
def process_results(self, doc, results):
-continuation = results[0].strip().lower().translate(str.maketrans('', '', string.punctuation))
-list_of_candidates = [alias.lower().translate(str.maketrans('', '', string.punctuation)) for alias in doc["answer"]["aliases"]]
+continuation = (
+results[0]
+.strip()
+.lower()
+.translate(str.maketrans("", "", string.punctuation))
+)
+list_of_candidates = [
+alias.lower().translate(str.maketrans("", "", string.punctuation))
+for alias in doc["answer"]["aliases"]
+]
return {"em": float(continuation in list_of_candidates)}
def aggregation(self):
......
@@ -12,17 +12,27 @@ def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--model", required=True)
parser.add_argument("--model_args", default="")
-parser.add_argument("--tasks", default=None, choices=utils.MultiChoice(tasks.ALL_TASKS))
+parser.add_argument(
+"--tasks", default=None, choices=utils.MultiChoice(tasks.ALL_TASKS)
+)
parser.add_argument("--provide_description", action="store_true")
parser.add_argument("--num_fewshot", type=int, default=0)
parser.add_argument("--batch_size", type=str, default=None)
-parser.add_argument("--max_batch_size", type=int, default=None,
-help="Maximal batch size to try with --batch_size auto")
+parser.add_argument(
+"--max_batch_size",
+type=int,
+default=None,
+help="Maximal batch size to try with --batch_size auto",
+)
parser.add_argument("--device", type=str, default=None)
parser.add_argument("--output_path", default=None)
-parser.add_argument("--limit", type=float, default=None,
+parser.add_argument(
+"--limit",
+type=float,
+default=None,
help="Limit the number of examples per task. "
-"If <1, limit is a percentage of the total number of examples.")
+"If <1, limit is a percentage of the total number of examples.",
+)
parser.add_argument("--data_sampling", type=float, default=None)
parser.add_argument("--no_cache", action="store_true")
parser.add_argument("--decontamination_ngrams_path", default=None)
@@ -77,7 +87,9 @@ def main():
print(dumped)
if args.output_path:
-os.makedirs(os.path.dirname(args.output_path), exist_ok=True)
+dirname = os.path.dirname(args.output_path)
+if dirname:
+os.makedirs(dirname, exist_ok=True)
with open(args.output_path, "w") as f:
f.write(dumped)
......
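This hunk is the bug the PR title refers to: for a bare filename like results.json, os.path.dirname returns the empty string, and os.makedirs("") raises FileNotFoundError, so writing output to the current working directory crashed. Guarding on a non-empty dirname fixes it:

    import os

    output_path = "results.json"  # a path in the CWD has no directory component
    assert os.path.dirname(output_path) == ""
    # os.makedirs("") raises FileNotFoundError, hence the guard:
    dirname = os.path.dirname(output_path)
    if dirname:
        os.makedirs(dirname, exist_ok=True)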
@@ -9,7 +9,12 @@ from lm_eval import tasks, utils
seq2seq_models = ["google/flan-t5-small"]
-causal_models = ["gpt2", "facebook/opt-125m", "EleutherAI/gpt-neo-125m", "EleutherAI/pythia-160m"]
+causal_models = [
+"gpt2",
+"facebook/opt-125m",
+"EleutherAI/gpt-neo-125m",
+"EleutherAI/pythia-160m",
+]
model_names = seq2seq_models + causal_models
@@ -50,22 +55,41 @@ def eval_models(args, branch=None):
results = {}
for model in args.models:
-model_type = "hf-causal-experimental" if model in causal_models \
-else "hf-seq2seq" if model in seq2seq_models else args.model
+model_type = (
+"hf-causal-experimental"
+if model in causal_models
+else "hf-seq2seq"
+if model in seq2seq_models
+else args.model
+)
model_args = f"pretrained={model},{args.model_args}"
# TODO: split_and_pad_windows in AutoSeq2SeqLM doesn"t exist, #527
-tasks = args.tasks if model in causal_models or model_type == "hf-causal-experimental" \
+tasks = (
+args.tasks
+if model in causal_models or model_type == "hf-causal-experimental"
+else list(filter(lambda task: task not in perplexity_tasks, args.tasks))
+)
# TODO: OOM with auto for seq2seq models, also can OOM with llama
-batch_size = args.batch_size if model in causal_models or model_type == "hf-causal-experimental" \
-else 64 if args.batch_size == "auto" else args.batch_size
-output_path = f"data/regression/{int(start_time)}-{branch}-{Path(model).name}.json"
-command = f"python3 main.py --model {model_type} --model_args {model_args} --tasks {','.join(tasks)} " \
-f"--num_fewshot {args.num_fewshot}{'' if args.limit is None else f' --limit {args.limit}'} " \
+batch_size = (
+args.batch_size
+if model in causal_models or model_type == "hf-causal-experimental"
+else 64
+if args.batch_size == "auto"
+else args.batch_size
+)
+output_path = (
+f"data/regression/{int(start_time)}-{branch}-{Path(model).name}.json"
+)
+command = (
+f"python3 main.py --model {model_type} --model_args {model_args} --tasks {','.join(tasks)} "
+f"--num_fewshot {args.num_fewshot}{'' if args.limit is None else f' --limit {args.limit}'} "
+f"--batch_size {batch_size} --no_cache --output_path {output_path}"
+)
-print(f"{'=' * 80}\nEvaluating {model} on {', '.join(tasks)} at {branch} with:\n\n{command}\n{'=' * 80}")
+print(
+f"{'=' * 80}\nEvaluating {model} on {', '.join(tasks)} at {branch} with:\n\n{command}\n{'=' * 80}"
+)
ret = os.system(command)
@@ -108,13 +132,25 @@ def format_diff(args, results1, results2, model, task):
def main():
args = parse_args()
-args.branches = args.branches.split(",") if type(args.branches) == str else args.branches
+args.branches = (
+args.branches.split(",") if type(args.branches) == str else args.branches
+)
args.models = args.models.split(",") if type(args.models) == str else args.models
-args.tasks = tasks.ALL_TASKS if args.tasks == "all_tasks" \
-else utils.pattern_match(args.tasks.split(",") if type(args.tasks) == str else args.tasks, tasks.ALL_TASKS)
+args.tasks = (
+tasks.ALL_TASKS
+if args.tasks == "all_tasks"
+else utils.pattern_match(
+args.tasks.split(",") if type(args.tasks) == str else args.tasks,
+tasks.ALL_TASKS,
+)
+)
global initial_branch
-initial_branch = subprocess.check_output("git branch --show-current", shell=True).decode("ascii").strip()
+initial_branch = (
+subprocess.check_output("git branch --show-current", shell=True)
+.decode("ascii")
+.strip()
+)
# TODO: implement proper timing for each task
# TODO: reduce IO by sharing tasks between models?
@@ -132,10 +168,16 @@ def main():
print(f"|task|{'|'.join(map(lambda model: Path(model).name, args.models))}|")
print(f"|--|{'--|' * len(args.models)}")
for task in args.tasks:
-print(f"|{task} ({initial_branch})|{'|'.join(map(lambda model: format_value(args, results, model, task), args.models))}|")
+print(
+f"|{task} ({initial_branch})|{'|'.join(map(lambda model: format_value(args, results, model, task), args.models))}|"
+)
for branch, branch_results, branch_runtime in runs:
-print(f"|{task} ({branch})|{'|'.join(map(lambda model: format_value(args, branch_results, model, task), args.models))}|")
-print(f"|{task} (diff)|{'|'.join(map(lambda model: format_diff(args, results, branch_results, model, task), args.models))}|")
+print(
+f"|{task} ({branch})|{'|'.join(map(lambda model: format_value(args, branch_results, model, task), args.models))}|"
+)
+print(
+f"|{task} (diff)|{'|'.join(map(lambda model: format_diff(args, results, branch_results, model, task), args.models))}|"
+)
print("")
print("|branch|runtime|%|")
......
@@ -13,9 +13,7 @@ setuptools.setup(
long_description_content_type="text/markdown",
url="https://github.com/EleutherAI/lm-evaluation-harness",
packages=setuptools.find_packages(),
-package_data={
-"lm_eval": ["**/*.json"]
-},
+package_data={"lm_eval": ["**/*.json"]},
include_package_data=True,
classifiers=[
"Development Status :: 3 - Alpha",
......