Unverified commit 2c18e367 authored by Lintang Sutawika, committed by GitHub

Merge pull request #876 from jonabur/output_bugfix

fix bug with output path in CWD
parents 00209e10 93cbffa5
@@ -25,6 +25,3 @@ WORKDIR /lm-evaluation-harness
 RUN pip install --no-cache-dir -e .
 ### Run bash
 CMD ["/bin/bash"]
@@ -309,7 +309,9 @@ class BaseLM(LM):
             if override_bs is not None
             else 0,
             fn=_batch_scheduler
-            if self.batch_size == "auto" and n_reordered_requests > 0 and not override_bs
+            if self.batch_size == "auto"
+            and n_reordered_requests > 0
+            and not override_bs
             else None,
         ):
             inps = []
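Note: with `--batch_size auto`, `_batch_scheduler` is engaged only when there are requests to schedule and no explicit override. The scheduler itself is not shown in this diff; as a rough illustration of the idea behind automatic batch sizing (a hypothetical helper, not the harness's actual implementation), it amounts to shrinking a candidate size until a trial batch fits in GPU memory:

```python
import torch

def find_workable_batch_size(start_size, run_trial_batch):
    """Halve the candidate batch size until a trial forward pass stops OOM-ing.

    `run_trial_batch` is any callable that executes one batch of the given size.
    Sketch only; the harness wires this logic up through `_batch_scheduler`.
    """
    size = start_size
    while size > 1:
        try:
            run_trial_batch(size)
            return size
        except RuntimeError as e:  # CUDA OOM surfaces as a RuntimeError
            if "out of memory" not in str(e):
                raise
            torch.cuda.empty_cache()
            size //= 2
    return 1
```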
@@ -375,7 +377,9 @@ class BaseLM(LM):
                 # Slice to original seq length
                 contlen = len(cont_toks)
-                inplen = inplen + (logits.shape[0] - padding_length)  # if "virtual tokens" (from prompt tuning) are added, inplen is larger
+                inplen = inplen + (
+                    logits.shape[0] - padding_length
+                )  # if "virtual tokens" (from prompt tuning) are added, inplen is larger
                 logits = logits[inplen - contlen : inplen].unsqueeze(
                     0
                 )  # [1, seq, vocab]
...
@@ -74,7 +74,12 @@ def simple_evaluate(
         if model_args is None:
             model_args = ""
         lm = lm_eval.models.get_model(model).create_from_arg_string(
-            model_args, {"batch_size": batch_size, "max_batch_size": max_batch_size, "device": device}
+            model_args,
+            {
+                "batch_size": batch_size,
+                "max_batch_size": max_batch_size,
+                "device": device,
+            },
         )
     elif isinstance(model, transformers.PreTrainedModel):
         lm = lm_eval.models.get_model("hf-causal")(
@@ -125,7 +130,9 @@ def simple_evaluate(
         "model_args": model_args,
         "num_fewshot": num_fewshot,
         "batch_size": batch_size,
-        "batch_sizes": list(lm.batch_sizes.values()) if hasattr(lm, "batch_sizes") else [],
+        "batch_sizes": list(lm.batch_sizes.values())
+        if hasattr(lm, "batch_sizes")
+        else [],
         "device": device,
         "no_cache": no_cache,
         "limit": limit,
...
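Note: `create_from_arg_string` merges the `--model_args` string with the explicit settings passed as a dict. A minimal sketch of that parsing, assuming a plain comma-separated `key=value` grammar (illustrative names, not the harness's exact helpers):

```python
def create_kwargs_from_arg_string(arg_string, additional_config):
    """Parse "pretrained=gpt2,dtype=float16" into kwargs, then overlay extras."""
    kwargs = {}
    for pair in filter(None, arg_string.split(",")):
        key, value = pair.split("=", 1)
        kwargs[key.strip()] = value.strip()
    # Explicit settings such as batch_size / max_batch_size / device win out.
    kwargs.update({k: v for k, v in additional_config.items() if v is not None})
    return kwargs

# create_kwargs_from_arg_string("pretrained=gpt2,dtype=float16", {"device": "cuda:0"})
# -> {"pretrained": "gpt2", "dtype": "float16", "device": "cuda:0"}
```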
@@ -4,9 +4,7 @@ from typing import Optional, Union
 from lm_eval.base import BaseLM
 
-def _get_dtype(
-    dtype: Union[str, torch.dtype]
-) -> torch.dtype:
+def _get_dtype(dtype: Union[str, torch.dtype]) -> torch.dtype:
     """Converts `dtype` from `str` to torch.dtype when possible. Does not use an instantiated HF AutoConfig"""
     if isinstance(dtype, str) and dtype != "auto":
         # Convert `str` args torch dtype: `float16` -> `torch.float16`
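The body of `_get_dtype` is truncated in this hunk; per the docstring and comment above, the string branch resolves the name as an attribute of the `torch` module. A sketch consistent with that (an assumed completion, not quoted from the diff):

```python
import torch
from typing import Union

def _get_dtype(dtype: Union[str, torch.dtype]) -> torch.dtype:
    """Convert `dtype` from str to torch.dtype when possible; pass "auto" through."""
    if isinstance(dtype, str) and dtype != "auto":
        # "float16" -> torch.float16, "bfloat16" -> torch.bfloat16, ...
        return getattr(torch, dtype)
    return dtype

assert _get_dtype("bfloat16") is torch.bfloat16
assert _get_dtype(torch.float32) is torch.float32
```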
@@ -33,11 +31,10 @@ class HFLM(BaseLM):
         max_length=None,
         load_in_8bit: Optional[bool] = False,
         trust_remote_code: Optional[bool] = False,
-        dtype: Optional[Union[str, torch.dtype]]="auto",
+        dtype: Optional[Union[str, torch.dtype]] = "auto",
     ):
         super().__init__()
 
         # Initialize model
         if isinstance(pretrained, transformers.PreTrainedModel):
             self.model = pretrained
@@ -45,12 +42,8 @@ class HFLM(BaseLM):
             if tokenizer:
                 assert isinstance(
-                    tokenizer,
-                    transformers.PreTrainedTokenizer
-                ) or isinstance(
-                    tokenizer,
-                    transformers.PreTrainedTokenizerFast
-                )
+                    tokenizer, transformers.PreTrainedTokenizer
+                ) or isinstance(tokenizer, transformers.PreTrainedTokenizerFast)
                 self.tokenizer = tokenizer
             else:
                 # Get tokenizer
@@ -66,7 +59,8 @@ class HFLM(BaseLM):
         # Initialize device
         assert isinstance(device, str)
         device_list = set(
-            ["cuda", "cpu"] + [f"cuda:{i}" for i in range(torch.cuda.device_count())]
+            ["cuda", "cpu"]
+            + [f"cuda:{i}" for i in range(torch.cuda.device_count())]
         )
         if device and device in device_list:
             self._device = torch.device(device)
@@ -97,7 +91,9 @@ class HFLM(BaseLM):
             )
         else:
-            raise TypeError('Parameter pretrained should be of type str or transformers.PreTrainedModel')
+            raise TypeError(
+                "Parameter pretrained should be of type str or transformers.PreTrainedModel"
+            )
 
         self.model.eval()
@@ -136,7 +132,6 @@ class HFLM(BaseLM):
             return self.tokenizer.model_max_length
         return self._DEFAULT_MAX_LENGTH
 
     @property
     def max_gen_toks(self):
         return 256
@@ -171,8 +166,10 @@ class HFLM(BaseLM):
     def _model_generate(self, context, max_length, eos_token_id):
         generation_kwargs = {"do_sample": False, "max_length": max_length}
         if eos_token_id is not None:
-            generation_kwargs['eos_token_id'] = eos_token_id
-            generation_kwargs['pad_token_id'] = eos_token_id # setting eos_token_id as pad token
+            generation_kwargs["eos_token_id"] = eos_token_id
+            generation_kwargs[
+                "pad_token_id"
+            ] = eos_token_id  # setting eos_token_id as pad token
         return self.model.generate(context, **generation_kwargs)
...
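Note: passing `pad_token_id=eos_token_id` matters for models such as GPT-2 that define no pad token; without it, `generate` logs a pad-token fallback warning on every call. Roughly (illustrative values, standard `transformers` API):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

context = tokenizer("The capital of France is", return_tensors="pt").input_ids
out = model.generate(
    context,
    do_sample=False,
    max_length=16,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,  # suppress the pad-token fallback warning
)
print(tokenizer.decode(out[0]))
```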
@@ -266,7 +266,9 @@ class HuggingFaceAutoLM(BaseLM):
             try:
                 self.model.to(self._device)
             except:
-                print("Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes`. If the desired GPU is being used, this message is safe to ignore.")
+                print(
+                    "Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes`. If the desired GPU is being used, this message is safe to ignore."
+                )
 
     def _create_auto_model(
         self,
@@ -292,7 +294,9 @@ class HuggingFaceAutoLM(BaseLM):
         """Returns a pre-trained pytorch model from a pre-trained model configuration."""
         if not quantized:
             if load_in_4bit:
-                assert transformers.__version__ >= "4.30.0", "load_in_4bit requires transformers >= 4.30.0"
+                assert (
+                    transformers.__version__ >= "4.30.0"
+                ), "load_in_4bit requires transformers >= 4.30.0"
             model_kwargs = {}
             if transformers.__version__ >= "4.30.0":
                 model_kwargs["load_in_4bit"] = load_in_4bit
@@ -300,9 +304,13 @@ class HuggingFaceAutoLM(BaseLM):
                 if bnb_4bit_quant_type:
                     model_kwargs["bnb_4bit_quant_type"] = bnb_4bit_quant_type
                 if bnb_4bit_compute_dtype:
-                    model_kwargs["bnb_4bit_compute_dtype"] = _get_dtype(bnb_4bit_compute_dtype)
+                    model_kwargs["bnb_4bit_compute_dtype"] = _get_dtype(
+                        bnb_4bit_compute_dtype
+                    )
                 if bnb_4bit_use_double_quant:
-                    model_kwargs["bnb_4bit_use_double_quant"] = bnb_4bit_use_double_quant
+                    model_kwargs[
+                        "bnb_4bit_use_double_quant"
+                    ] = bnb_4bit_use_double_quant
             model = self.AUTO_MODEL_CLASS.from_pretrained(
                 pretrained,
                 revision=revision + ("/" + subfolder if subfolder is not None else ""),
@@ -317,13 +325,16 @@ class HuggingFaceAutoLM(BaseLM):
             )
         else:
             from auto_gptq import AutoGPTQForCausalLM
+
             model = AutoGPTQForCausalLM.from_quantized(
                 pretrained,
                 model_basename=None if quantized == True else Path(quantized).stem,
                 device_map=device_map,
                 max_memory=max_memory,
                 trust_remote_code=trust_remote_code,
-                use_safetensors=True if quantized == True else quantized.endswith('.safetensors'),
+                use_safetensors=True
+                if quantized == True
+                else quantized.endswith(".safetensors"),
                 use_triton=gptq_use_triton,
                 warmup_triton=gptq_use_triton,
                 inject_fused_attention=inject_fused_attention,
...
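Note: `quantized` is either `True` (auto-discover the checkpoint) or a filename; the two reformatted ternaries encode both shapes. A condensed restatement of that logic:

```python
from pathlib import Path

def gptq_load_kwargs(quantized):
    """quantized is True or a checkpoint name such as "model.safetensors"."""
    return {
        "model_basename": None if quantized is True else Path(quantized).stem,
        "use_safetensors": True
        if quantized is True
        else str(quantized).endswith(".safetensors"),
    }

# gptq_load_kwargs(True) -> {"model_basename": None, "use_safetensors": True}
# gptq_load_kwargs("model.safetensors") -> {"model_basename": "model", "use_safetensors": True}
```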
@@ -330,11 +330,11 @@ TASK_REGISTRY = {
     "csatqa_rch": csatqa.RCH,
     "csatqa_li": csatqa.LI,
     "haerae_hi": haerae.HI,
-    "haerae_kgk":haerae.KGK,
-    "haerae_lw":haerae.LW,
-    "haerae_rc":haerae.RC,
-    "haerae_rw":haerae.RW,
-    "haerae_sn":haerae.SN,
+    "haerae_kgk": haerae.KGK,
+    "haerae_lw": haerae.LW,
+    "haerae_rc": haerae.RC,
+    "haerae_rw": haerae.RW,
+    "haerae_sn": haerae.SN,
-    # Requires manual download
+    # Requires manual download of data.
     # "storycloze_2016": storycloze.StoryCloze2016,
...
@@ -16,6 +16,7 @@ _CITATION = """
 }
 """
 
+
 class Babi(Task):
     VERSION = 0
     DATASET_PATH = "Muennighoff/babi"
@@ -43,9 +44,7 @@ class Babi(Task):
         return self.dataset["test"]
 
     def doc_to_text(self, doc):
-        return (
-            doc['passage'] + doc['question']
-        )
+        return doc["passage"] + doc["question"]
 
     def should_decontaminate(self):
         return False  # TODO Necessary?
@@ -54,7 +53,7 @@ class Babi(Task):
         return f"Passage: {doc['passage']}\nQuestion: {doc['question']}\nAnswer:"
 
     def doc_to_target(self, doc):
-        return " " + doc['answer']
+        return " " + doc["answer"]
 
     def construct_requests(self, doc, ctx):
         """Uses RequestFactory to construct Requests and returns an iterable of
...
@@ -21,58 +21,58 @@ _CITATION = """
 SUBJECTS = {
-    "computer_network":"计算机网络",
-    "operating_system":"操作系统",
-    "computer_architecture":"计算机组成",
-    "college_programming":"大学编程",
-    "college_physics":"大学物理",
-    "college_chemistry":"大学化学",
-    "advanced_mathematics":"高等数学",
-    "probability_and_statistics":"概率统计",
-    "discrete_mathematics":"离散数学",
-    "electrical_engineer":"注册电气工程师",
-    "metrology_engineer":"注册计量师",
-    "high_school_mathematics":"高中数学",
-    "high_school_physics":"高中物理",
-    "high_school_chemistry":"高中化学",
-    "high_school_biology":"高中生物",
-    "middle_school_mathematics":"初中数学",
-    "middle_school_biology":"初中生物",
-    "middle_school_physics":"初中物理",
-    "middle_school_chemistry":"初中化学",
-    "veterinary_medicine":"兽医学",
-    "college_economics":"大学经济学",
-    "business_administration":"工商管理",
-    "marxism":"马克思主义基本原理",
-    "mao_zedong_thought":"毛泽东思想和中国特色社会主义理论体系概论",
-    "education_science":"教育学",
-    "teacher_qualification":"教师资格",
-    "high_school_politics":"高中政治",
-    "high_school_geography":"高中地理",
-    "middle_school_politics":"初中政治",
-    "middle_school_geography":"初中地理",
-    "modern_chinese_history":"近代史纲要",
-    "ideological_and_moral_cultivation":"思想道德修养与法律基础",
-    "logic":"逻辑学",
-    "law":"法学",
-    "chinese_language_and_literature":"中国语言文学",
-    "art_studies":"艺术学",
-    "professional_tour_guide":"导游资格",
-    "legal_professional":"法律职业资格",
-    "high_school_chinese":"高中语文",
-    "high_school_history":"高中历史",
-    "middle_school_history":"初中历史",
-    "civil_servant":"公务员",
-    "sports_science":"体育学",
-    "plant_protection":"植物保护",
-    "basic_medicine":"基础医学",
-    "clinical_medicine":"临床医学",
-    "urban_and_rural_planner":"注册城乡规划师",
-    "accountant":"注册会计师",
-    "fire_engineer":"注册消防工程师",
-    "environmental_impact_assessment_engineer":"环境影响评价工程师",
-    "tax_accountant":"税务师",
-    "physician":"医师资格"
+    "computer_network": "计算机网络",
+    "operating_system": "操作系统",
+    "computer_architecture": "计算机组成",
+    "college_programming": "大学编程",
+    "college_physics": "大学物理",
+    "college_chemistry": "大学化学",
+    "advanced_mathematics": "高等数学",
+    "probability_and_statistics": "概率统计",
+    "discrete_mathematics": "离散数学",
+    "electrical_engineer": "注册电气工程师",
+    "metrology_engineer": "注册计量师",
+    "high_school_mathematics": "高中数学",
+    "high_school_physics": "高中物理",
+    "high_school_chemistry": "高中化学",
+    "high_school_biology": "高中生物",
+    "middle_school_mathematics": "初中数学",
+    "middle_school_biology": "初中生物",
+    "middle_school_physics": "初中物理",
+    "middle_school_chemistry": "初中化学",
+    "veterinary_medicine": "兽医学",
+    "college_economics": "大学经济学",
+    "business_administration": "工商管理",
+    "marxism": "马克思主义基本原理",
+    "mao_zedong_thought": "毛泽东思想和中国特色社会主义理论体系概论",
+    "education_science": "教育学",
+    "teacher_qualification": "教师资格",
+    "high_school_politics": "高中政治",
+    "high_school_geography": "高中地理",
+    "middle_school_politics": "初中政治",
+    "middle_school_geography": "初中地理",
+    "modern_chinese_history": "近代史纲要",
+    "ideological_and_moral_cultivation": "思想道德修养与法律基础",
+    "logic": "逻辑学",
+    "law": "法学",
+    "chinese_language_and_literature": "中国语言文学",
+    "art_studies": "艺术学",
+    "professional_tour_guide": "导游资格",
+    "legal_professional": "法律职业资格",
+    "high_school_chinese": "高中语文",
+    "high_school_history": "高中历史",
+    "middle_school_history": "初中历史",
+    "civil_servant": "公务员",
+    "sports_science": "体育学",
+    "plant_protection": "植物保护",
+    "basic_medicine": "基础医学",
+    "clinical_medicine": "临床医学",
+    "urban_and_rural_planner": "注册城乡规划师",
+    "accountant": "注册会计师",
+    "fire_engineer": "注册消防工程师",
+    "environmental_impact_assessment_engineer": "环境影响评价工程师",
+    "tax_accountant": "税务师",
+    "physician": "医师资格",
 }
@@ -112,11 +112,11 @@ class CevalSubject(MultipleChoiceTask):
     def validation_docs(self):
         if self.has_validation_docs():
-            return map(self._process_doc,self.dataset["val"])
+            return map(self._process_doc, self.dataset["val"])
 
     def test_docs(self):
         if self.has_test_docs():
-            return map(self._process_doc,self.dataset["test"])
+            return map(self._process_doc, self.dataset["test"])
 
     def _format_subject(self, subject):
         words = subject.split("_")
@@ -124,7 +124,7 @@ class CevalSubject(MultipleChoiceTask):
     def fewshot_context(self, doc, num_fewshot, **kwargs):
         subject = self.DATASET_NAME
-        description= f"以下是中国关于{SUBJECTS[subject]}的单项选择题,请选出其中的正确答案。"
+        description = f"以下是中国关于{SUBJECTS[subject]}的单项选择题,请选出其中的正确答案。"
         kwargs["description"] = description
         return super().fewshot_context(doc=doc, num_fewshot=num_fewshot, **kwargs)
@@ -140,9 +140,7 @@ class CevalSubject(MultipleChoiceTask):
         """
         question = doc["question"].strip()
-        choices = "".join(
-            [f'{key}. {doc[key]}\n' for key in keys]
-        )
+        choices = "".join([f"{key}. {doc[key]}\n" for key in keys])
         prompt = f"{question}\n{choices}答案:"
         return prompt
@@ -150,7 +148,7 @@ class CevalSubject(MultipleChoiceTask):
         return {
             "query": format_example(doc, keys),
             "choices": keys,
-            "gold": ord(doc["answer"])-ord("A"),
+            "gold": ord(doc["answer"]) - ord("A"),
         }
 
     def fewshot_examples(self, k, rnd):
...
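Note: the `gold` expression maps a letter answer onto an index into `keys`:

```python
keys = ["A", "B", "C", "D"]
answer = "C"
gold = ord(answer) - ord("A")  # 67 - 65 = 2
assert keys[gold] == "C"
```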
@@ -32,16 +32,16 @@ SUBJECTS = {
     "chinese_driving_rule": "中国驾驶规则",
     "chinese_food_culture": "中国饮食文化",
     "chinese_foreign_policy": "中国外交政策",
-    "chinese_history":"中国历史",
+    "chinese_history": "中国历史",
     "chinese_literature": "中国文学",
     "chinese_teacher_qualification": "中国教师资格",
     "clinical_knowledge": "临床知识",
-    "college_actuarial_science":"大学精算学",
-    "college_education":"大学教育学",
+    "college_actuarial_science": "大学精算学",
+    "college_education": "大学教育学",
     "college_engineering_hydrology": "大学工程水文学",
     "college_law": "大学法律",
     "college_mathematics": "大学数学",
-    "college_medical_statistics":"大学医学统计",
+    "college_medical_statistics": "大学医学统计",
     "college_medicine": "大学医学",
     "computer_science": "计算机科学",
     "computer_security": "计算机安全",
@@ -50,8 +50,8 @@ SUBJECTS = {
     "economics": "经济学",
     "education": "教育学",
     "electrical_engineering": "电气工程",
-    "elementary_chinese":"小学语文",
-    "elementary_commonsense":"小学常识",
+    "elementary_chinese": "小学语文",
+    "elementary_commonsense": "小学常识",
     "elementary_information_and_technology": "小学信息技术",
     "elementary_mathematics": "初等数学",
     "ethnology": "民族学",
@@ -82,12 +82,12 @@ SUBJECTS = {
     "professional_medicine": "专业医学",
     "professional_psychology": "专业心理学",
     "public_relations": "公共关系",
-    "security_study":"安全研究",
+    "security_study": "安全研究",
     "sociology": "社会学",
     "sports_science": "体育学",
     "traditional_chinese_medicine": "中医中药",
     "virology": "病毒学",
-    "world_history":"世界历史",
+    "world_history": "世界历史",
     "world_religions": "世界宗教",
 }
@@ -128,11 +128,11 @@ class CmmluSubject(MultipleChoiceTask):
     def validation_docs(self):
         if self.has_validation_docs():
-            return map(self._process_doc,self.dataset["dev"])
+            return map(self._process_doc, self.dataset["dev"])
 
     def test_docs(self):
         if self.has_test_docs():
-            return map(self._process_doc,self.dataset["test"])
+            return map(self._process_doc, self.dataset["test"])
 
     def _format_subject(self, subject):
         words = subject.split("_")
@@ -140,7 +140,7 @@ class CmmluSubject(MultipleChoiceTask):
     def fewshot_context(self, doc, num_fewshot, **kwargs):
         subject = self.DATASET_NAME
-        description= f"以下是关于{SUBJECTS[subject]}的单项选择题,请直接给出正确答案的选项。"
+        description = f"以下是关于{SUBJECTS[subject]}的单项选择题,请直接给出正确答案的选项。"
         kwargs["description"] = description
         return super().fewshot_context(doc=doc, num_fewshot=num_fewshot, **kwargs)
@@ -156,9 +156,7 @@ class CmmluSubject(MultipleChoiceTask):
         """
         question = doc["Question"].strip()
-        choices = "".join(
-            [f'{key}. {doc[key]}\n' for key in keys]
-        )
+        choices = "".join([f"{key}. {doc[key]}\n" for key in keys])
         prompt = f"{question}\n{choices}答案:"
         return prompt
@@ -166,7 +164,7 @@ class CmmluSubject(MultipleChoiceTask):
         return {
             "query": format_example(doc, keys),
             "choices": keys,
-            "gold": ord(doc["Answer"])-ord("A"),
+            "gold": ord(doc["Answer"]) - ord("A"),
         }
 
     def fewshot_examples(self, k, rnd):
...
@@ -25,11 +25,17 @@ class CSATQA(MultipleChoiceTask):
 (1) {doc['option#1']}\n(2) {doc["option#2"]}\n(3) {doc["option#3"]}\n(4) {doc['option#4']}\n(5) {doc['option#5']}
 ### Answer: 주어진 문제의 정답은"""
-        choices = [doc["option#1"], doc["option#2"], doc["option#3"], doc["option#4"], doc["option#5"]]
+        choices = [
+            doc["option#1"],
+            doc["option#2"],
+            doc["option#3"],
+            doc["option#4"],
+            doc["option#5"],
+        ]
         out_doc = {
             "question": instruction,
-            "choices": ["(1)", "(2)","(3)","(4)","(5)"],
-            "gold": int(doc['gold'])-1,
+            "choices": ["(1)", "(2)", "(3)", "(4)", "(5)"],
+            "gold": int(doc["gold"]) - 1,
         }
         return out_doc
@@ -41,17 +47,22 @@ class CSATQA(MultipleChoiceTask):
 class WR(CSATQA):
     DATASET_NAME = "WR"
 
+
 class GR(CSATQA):
     DATASET_NAME = "GR"
 
+
 class RCS(CSATQA):
     DATASET_NAME = "RCS"
 
+
 class RCSS(CSATQA):
     DATASET_NAME = "RCSS"
 
+
 class RCH(CSATQA):
     DATASET_NAME = "RCH"
 
+
 class LI(CSATQA):
     DATASET_NAME = "LI"
@@ -24,7 +24,7 @@ class Haerae(MultipleChoiceTask):
         out_doc = {
             "query": doc["query"],
             "choices": choices,
-            "gold": int(doc['gold'])-1,
+            "gold": int(doc["gold"]) - 1,
         }
         return out_doc
...
@@ -86,10 +86,10 @@ class NQOpen(Task):
     def _normalize_answer(self, text):
         # Lowercase and remove punctuation, strip whitespace
-        text = text.strip().lower().translate(str.maketrans('', '', string.punctuation))
+        text = text.strip().lower().translate(str.maketrans("", "", string.punctuation))
         # Remove articles, resulting in duplicate whitespace
-        text = regex.sub(r'\b(a|an|the)\b', ' ', text)
+        text = regex.sub(r"\b(a|an|the)\b", " ", text)
         # Remove duplicate whitespace
         text = " ".join(text.split())
@@ -109,9 +109,7 @@ class NQOpen(Task):
         continuation = self._normalize_answer(results[0])
         answers = [self._normalize_answer(answer) for answer in doc["answer"]]
-        return {
-            "em": float(continuation in answers)
-        }
+        return {"em": float(continuation in answers)}
 
     def aggregation(self):
         """
...
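Note: the normalization makes exact match robust to case, punctuation, articles, and spacing:

```python
import re
import string
# Sketch of the same pipeline; the task itself uses the third-party `regex`
# module, but the stdlib `re` behaves identically for this pattern.

def normalize_answer(text):
    text = text.strip().lower().translate(str.maketrans("", "", string.punctuation))
    text = re.sub(r"\b(a|an|the)\b", " ", text)  # drop articles
    return " ".join(text.split())  # squeeze whitespace

assert normalize_answer("The Eiffel Tower!") == "eiffel tower"
assert normalize_answer("eiffel   tower") == "eiffel tower"
```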
@@ -72,9 +72,14 @@ def _download_metric():
     import os
     import shutil
     from huggingface_hub import hf_hub_download
-    scrolls_metric_path = hf_hub_download(repo_id="tau/scrolls", repo_type="dataset", filename="metrics/scrolls.py")
+
+    scrolls_metric_path = hf_hub_download(
+        repo_id="tau/scrolls", repo_type="dataset", filename="metrics/scrolls.py"
+    )
     updated_scrolls_metric_path = (
-        os.path.dirname(scrolls_metric_path) + os.path.basename(scrolls_metric_path).replace(".", "_") + ".py"
+        os.path.dirname(scrolls_metric_path)
+        + os.path.basename(scrolls_metric_path).replace(".", "_")
+        + ".py"
     )
     shutil.copy(scrolls_metric_path, updated_scrolls_metric_path)
     return updated_scrolls_metric_path
@@ -92,7 +97,7 @@ def _process_doc_prepended_question(doc):
         "input": input,
         "outputs": doc["outputs"],
         "question": input[0:split],
-        "text": input[split + 2:]
+        "text": input[split + 2 :],
     }
@@ -102,7 +107,9 @@ def _drop_duplicates_in_input(untokenized_dataset):
     indices_to_keep = []
     id_to_idx = {}
     outputs = []
-    for i, (id_, output) in enumerate(zip(untokenized_dataset["id"], untokenized_dataset["output"])):
+    for i, (id_, output) in enumerate(
+        zip(untokenized_dataset["id"], untokenized_dataset["output"])
+    ):
         if id_ in id_to_idx:
             outputs[id_to_idx[id_]].append(output)
             continue
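Note: this loop keeps the first row seen for each `id` and folds the outputs of later duplicates into that row's list. A self-contained restatement of the pattern (the function's tail falls outside this hunk, so the bookkeeping shown for `id_to_idx` and `indices_to_keep` is a reasonable completion rather than a quote):

```python
def drop_duplicates(ids, outputs_in):
    indices_to_keep, id_to_idx, outputs = [], {}, []
    for i, (id_, output) in enumerate(zip(ids, outputs_in)):
        if id_ in id_to_idx:
            outputs[id_to_idx[id_]].append(output)  # merge into first occurrence
            continue
        id_to_idx[id_] = len(outputs)
        indices_to_keep.append(i)
        outputs.append([output])
    return indices_to_keep, outputs

# drop_duplicates(["a", "b", "a"], [1, 2, 3]) -> ([0, 1], [[1, 3], [2]])
```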
@@ -119,9 +126,11 @@ def _num_cpu_cores():
     # https://stackoverflow.com/questions/1006289/how-to-find-out-the-number-of-cpus-using-python/55423170#55423170
     try:
         import psutil
+
         return psutil.cpu_count(logical=False)
     except ImportError:
         import os
+
         return len(os.sched_getaffinity(0))
@@ -135,7 +144,11 @@ class _SCROLLSTask(Task):
     def __init__(self, no_metric=False):
         super().__init__()
-        self.metric = load_metric(_download_metric(), config_name=self.DATASET_NAME) if not no_metric else None
+        self.metric = (
+            load_metric(_download_metric(), config_name=self.DATASET_NAME)
+            if not no_metric
+            else None
+        )
 
     def has_training_docs(self):
         return True
@@ -176,7 +189,10 @@ class _SCROLLSTask(Task):
         that are less than `max_tokens` when tokenized by each tokenizer
         """
-        tokenizers = [AutoTokenizer.from_pretrained(tokenizer) for tokenizer in self.PRUNE_TOKENIZERS]
+        tokenizers = [
+            AutoTokenizer.from_pretrained(tokenizer)
+            for tokenizer in self.PRUNE_TOKENIZERS
+        ]
         cache = {}
 
         def _filter(sample):
@@ -210,18 +226,21 @@ class _SCROLLSTask(Task):
     def _make_compute_metrics(self, value):
         def compute_metrics(samples):
             predictions, references = zip(*samples)  # unzip, if you will
-            computed = self.metric.compute(predictions=predictions, references=references)
+            computed = self.metric.compute(
+                predictions=predictions, references=references
+            )
             return computed[value]
 
         return compute_metrics
 
     def aggregation(self):
         return {
-            key: self._make_compute_metrics(value) for key, value in self._scrolls_metrics().items()
+            key: self._make_compute_metrics(value)
+            for key, value in self._scrolls_metrics().items()
        }
 
 
 class _SCROLLSMultipleChoiceTask(_SCROLLSTask):
     def __init__(self):
         super().__init__(no_metric=True)
@@ -229,18 +248,10 @@ class _SCROLLSMultipleChoiceTask(_SCROLLSTask):
         return None
 
     def aggregation(self):
-        return {
-            "em": mean,
-            "acc": mean,
-            "acc_norm": mean
-        }
+        return {"em": mean, "acc": mean, "acc_norm": mean}
 
     def higher_is_better(self):
-        return {
-            "em": True,
-            "acc": True,
-            "acc_norm": True
-        }
+        return {"em": True, "acc": True, "acc_norm": True}
 
     def process_results(self, doc, results):
         gold = doc["gold"]
@@ -264,22 +275,25 @@ class _SCROLLSMultipleChoiceTask(_SCROLLSTask):
 class _SCROLLSSummaryTask(_SCROLLSTask):
     def _process_doc(self, doc):
         return [doc]
 
     def _scrolls_metrics(self):
-        return {"rouge1": "rouge/rouge1", "rouge2": "rouge/rouge2", "rougeL": "rouge/rougeL"}
+        return {
+            "rouge1": "rouge/rouge1",
+            "rouge2": "rouge/rouge2",
+            "rougeL": "rouge/rougeL",
+        }
 
     def process_results(self, doc, results):
         return {
             "rouge1": (results[0], doc["outputs"]),
             "rouge2": (results[0], doc["outputs"]),
-            "rougeL": (results[0], doc["outputs"])
+            "rougeL": (results[0], doc["outputs"]),
         }
 
     def construct_requests(self, doc, ctx):
-        return [rf.greedy_until(ctx, {'until': ["\n"]})]
+        return [rf.greedy_until(ctx, {"until": ["\n"]})]
 
     def doc_to_text(self, doc):
         return f"{doc['input']}\n\nQuestion: What is a summary of the preceding text?\nAnswer:"
@@ -294,8 +308,12 @@ class Qasper(_SCROLLSTask):
     def _process_doc(self, doc):
         doc = _process_doc_prepended_question(doc)
-        doc["is_yes_no"] = reduce(lambda prev, cur: prev and squad_metrics.normalize_answer(cur)
-                                  in ["yes", "no"], doc["outputs"], True)
+        doc["is_yes_no"] = reduce(
+            lambda prev, cur: prev
+            and squad_metrics.normalize_answer(cur) in ["yes", "no"],
+            doc["outputs"],
+            True,
+        )
         return [doc]
 
     def _scrolls_metrics(self):
@@ -308,9 +326,7 @@ class Qasper(_SCROLLSTask):
             prediction = "Unanswerable"
         else:
             prediction = results[0]
-        return {
-            "f1": (prediction, doc["outputs"])
-        }
+        return {"f1": (prediction, doc["outputs"])}
 
     def construct_requests(self, doc, ctx):
         if doc["is_yes_no"]:
@@ -318,7 +334,7 @@ class Qasper(_SCROLLSTask):
             ll_no, _ = rf.loglikelihood(ctx, " no")
             return [ll_yes, ll_no]
         else:
-            return [rf.greedy_until(ctx, {'until': ["\n"]})]
+            return [rf.greedy_until(ctx, {"until": ["\n"]})]
 
 
 class QuALITY(_SCROLLSMultipleChoiceTask):
@@ -340,8 +356,10 @@ class QuALITY(_SCROLLSMultipleChoiceTask):
         choices_text = doc["text"][:split]
 
         doc["text"] = doc["text"][split:].strip()
-        doc["choices"] = [QuALITY._normalize_answer(choice) for choice in re.split(
-            QuALITY._multiple_choice_pattern, choices_text)[1:]]
+        doc["choices"] = [
+            QuALITY._normalize_answer(choice)
+            for choice in re.split(QuALITY._multiple_choice_pattern, choices_text)[1:]
+        ]
         doc["gold"] = doc["choices"].index(QuALITY._normalize_answer(doc["outputs"][0]))
 
         return [doc]
@@ -368,12 +386,10 @@ class NarrativeQA(_SCROLLSTask):
         return self._process_doc(doc)[0]["text"]
 
     def process_results(self, doc, results):
-        return {
-            "f1": (results[0], doc["outputs"])
-        }
+        return {"f1": (results[0], doc["outputs"])}
 
     def construct_requests(self, doc, ctx):
-        return [rf.greedy_until(ctx, {'until': ["\n"]})]
+        return [rf.greedy_until(ctx, {"until": ["\n"]})]
 
 
 class ContractNLI(_SCROLLSMultipleChoiceTask):
@@ -439,5 +455,5 @@ def construct_tasks():
         "scrolls_contractnli": ContractNLI,
         "scrolls_govreport": GovReport,
         "scrolls_summscreenfd": SummScreenFD,
-        "scrolls_qmsum": QMSum
+        "scrolls_qmsum": QMSum,
     }
@@ -76,8 +76,16 @@ class TriviaQA(Task):
         return continuation
 
     def process_results(self, doc, results):
-        continuation = results[0].strip().lower().translate(str.maketrans('', '', string.punctuation))
-        list_of_candidates = [alias.lower().translate(str.maketrans('', '', string.punctuation)) for alias in doc["answer"]["aliases"]]
+        continuation = (
+            results[0]
+            .strip()
+            .lower()
+            .translate(str.maketrans("", "", string.punctuation))
+        )
+        list_of_candidates = [
+            alias.lower().translate(str.maketrans("", "", string.punctuation))
+            for alias in doc["answer"]["aliases"]
+        ]
         return {"em": float(continuation in list_of_candidates)}
 
     def aggregation(self):
...
@@ -12,17 +12,27 @@ def parse_args():
     parser = argparse.ArgumentParser()
     parser.add_argument("--model", required=True)
     parser.add_argument("--model_args", default="")
-    parser.add_argument("--tasks", default=None, choices=utils.MultiChoice(tasks.ALL_TASKS))
+    parser.add_argument(
+        "--tasks", default=None, choices=utils.MultiChoice(tasks.ALL_TASKS)
+    )
     parser.add_argument("--provide_description", action="store_true")
     parser.add_argument("--num_fewshot", type=int, default=0)
     parser.add_argument("--batch_size", type=str, default=None)
-    parser.add_argument("--max_batch_size", type=int, default=None,
-                        help="Maximal batch size to try with --batch_size auto")
+    parser.add_argument(
+        "--max_batch_size",
+        type=int,
+        default=None,
+        help="Maximal batch size to try with --batch_size auto",
+    )
     parser.add_argument("--device", type=str, default=None)
     parser.add_argument("--output_path", default=None)
-    parser.add_argument("--limit", type=float, default=None,
-                        help="Limit the number of examples per task. "
-                        "If <1, limit is a percentage of the total number of examples.")
+    parser.add_argument(
+        "--limit",
+        type=float,
+        default=None,
+        help="Limit the number of examples per task. "
+        "If <1, limit is a percentage of the total number of examples.",
+    )
     parser.add_argument("--data_sampling", type=float, default=None)
     parser.add_argument("--no_cache", action="store_true")
     parser.add_argument("--decontamination_ngrams_path", default=None)
@@ -77,7 +87,9 @@ def main():
     print(dumped)
 
     if args.output_path:
-        os.makedirs(os.path.dirname(args.output_path), exist_ok=True)
+        dirname = os.path.dirname(args.output_path)
+        if dirname:
+            os.makedirs(dirname, exist_ok=True)
         with open(args.output_path, "w") as f:
             f.write(dumped)
...
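This hunk is the bug the PR title refers to: for a bare filename such as `results.json`, `os.path.dirname` returns the empty string, and `os.makedirs("", exist_ok=True)` raises `FileNotFoundError`, so writing results into the current working directory used to crash. Guarding on a non-empty dirname fixes it:

```python
import os

for path in ("results.json", "out/results.json"):
    dirname = os.path.dirname(path)  # "" for a bare filename in the CWD
    if dirname:  # os.makedirs("") raises FileNotFoundError
        os.makedirs(dirname, exist_ok=True)
    with open(path, "w") as f:
        f.write("{}")
```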
@@ -9,7 +9,12 @@ from lm_eval import tasks, utils
 seq2seq_models = ["google/flan-t5-small"]
-causal_models = ["gpt2", "facebook/opt-125m", "EleutherAI/gpt-neo-125m", "EleutherAI/pythia-160m"]
+causal_models = [
+    "gpt2",
+    "facebook/opt-125m",
+    "EleutherAI/gpt-neo-125m",
+    "EleutherAI/pythia-160m",
+]
 model_names = seq2seq_models + causal_models
@@ -50,22 +55,41 @@ def eval_models(args, branch=None):
     results = {}
 
     for model in args.models:
-        model_type = "hf-causal-experimental" if model in causal_models \
-            else "hf-seq2seq" if model in seq2seq_models else args.model
+        model_type = (
+            "hf-causal-experimental"
+            if model in causal_models
+            else "hf-seq2seq"
+            if model in seq2seq_models
+            else args.model
+        )
         model_args = f"pretrained={model},{args.model_args}"
         # TODO: split_and_pad_windows in AutoSeq2SeqLM doesn"t exist, #527
-        tasks = args.tasks if model in causal_models or model_type == "hf-causal-experimental" \
-            else list(filter(lambda task: task not in perplexity_tasks, args.tasks))
+        tasks = (
+            args.tasks
+            if model in causal_models or model_type == "hf-causal-experimental"
+            else list(filter(lambda task: task not in perplexity_tasks, args.tasks))
+        )
         # TODO: OOM with auto for seq2seq models, also can OOM with llama
-        batch_size = args.batch_size if model in causal_models or model_type == "hf-causal-experimental" \
-            else 64 if args.batch_size == "auto" else args.batch_size
-        output_path = f"data/regression/{int(start_time)}-{branch}-{Path(model).name}.json"
-        command = f"python3 main.py --model {model_type} --model_args {model_args} --tasks {','.join(tasks)} " \
-            f"--num_fewshot {args.num_fewshot}{'' if args.limit is None else f' --limit {args.limit}'} " \
-            f"--batch_size {batch_size} --no_cache --output_path {output_path}"
-        print(f"{'=' * 80}\nEvaluating {model} on {', '.join(tasks)} at {branch} with:\n\n{command}\n{'=' * 80}")
+        batch_size = (
+            args.batch_size
+            if model in causal_models or model_type == "hf-causal-experimental"
+            else 64
+            if args.batch_size == "auto"
+            else args.batch_size
+        )
+        output_path = (
+            f"data/regression/{int(start_time)}-{branch}-{Path(model).name}.json"
+        )
+        command = (
+            f"python3 main.py --model {model_type} --model_args {model_args} --tasks {','.join(tasks)} "
+            f"--num_fewshot {args.num_fewshot}{'' if args.limit is None else f' --limit {args.limit}'} "
+            f"--batch_size {batch_size} --no_cache --output_path {output_path}"
+        )
+        print(
+            f"{'=' * 80}\nEvaluating {model} on {', '.join(tasks)} at {branch} with:\n\n{command}\n{'=' * 80}"
+        )
 
         ret = os.system(command)
@@ -108,13 +132,25 @@ def format_diff(args, results1, results2, model, task):
 def main():
     args = parse_args()
 
-    args.branches = args.branches.split(",") if type(args.branches) == str else args.branches
+    args.branches = (
+        args.branches.split(",") if type(args.branches) == str else args.branches
+    )
     args.models = args.models.split(",") if type(args.models) == str else args.models
-    args.tasks = tasks.ALL_TASKS if args.tasks == "all_tasks" \
-        else utils.pattern_match(args.tasks.split(",") if type(args.tasks) == str else args.tasks, tasks.ALL_TASKS)
+    args.tasks = (
+        tasks.ALL_TASKS
+        if args.tasks == "all_tasks"
+        else utils.pattern_match(
+            args.tasks.split(",") if type(args.tasks) == str else args.tasks,
+            tasks.ALL_TASKS,
+        )
+    )
 
     global initial_branch
-    initial_branch = subprocess.check_output("git branch --show-current", shell=True).decode("ascii").strip()
+    initial_branch = (
+        subprocess.check_output("git branch --show-current", shell=True)
+        .decode("ascii")
+        .strip()
+    )
 
     # TODO: implement proper timing for each task
     # TODO: reduce IO by sharing tasks between models?
@@ -132,10 +168,16 @@ def main():
     print(f"|task|{'|'.join(map(lambda model: Path(model).name, args.models))}|")
     print(f"|--|{'--|' * len(args.models)}")
     for task in args.tasks:
-        print(f"|{task} ({initial_branch})|{'|'.join(map(lambda model: format_value(args, results, model, task), args.models))}|")
+        print(
+            f"|{task} ({initial_branch})|{'|'.join(map(lambda model: format_value(args, results, model, task), args.models))}|"
+        )
         for branch, branch_results, branch_runtime in runs:
-            print(f"|{task} ({branch})|{'|'.join(map(lambda model: format_value(args, branch_results, model, task), args.models))}|")
-            print(f"|{task} (diff)|{'|'.join(map(lambda model: format_diff(args, results, branch_results, model, task), args.models))}|")
+            print(
+                f"|{task} ({branch})|{'|'.join(map(lambda model: format_value(args, branch_results, model, task), args.models))}|"
+            )
+            print(
+                f"|{task} (diff)|{'|'.join(map(lambda model: format_diff(args, results, branch_results, model, task), args.models))}|"
+            )
 
     print("")
     print("|branch|runtime|%|")
...
@@ -13,9 +13,7 @@ setuptools.setup(
     long_description_content_type="text/markdown",
     url="https://github.com/EleutherAI/lm-evaluation-harness",
     packages=setuptools.find_packages(),
-    package_data={
-        "lm_eval": ["**/*.json"]
-    },
+    package_data={"lm_eval": ["**/*.json"]},
     include_package_data=True,
     classifiers=[
         "Development Status :: 3 - Alpha",
...