Commit b2e1bfc6 authored by artemorloff's avatar artemorloff
Browse files

Merge remote-tracking branch 'origin' into feature/eval_from_config

parents b5d16d61 e4a7b69f
......@@ -6,14 +6,15 @@ dataset_path: THUDM/LongBench
test_split: test
dataset_name: trec_e
doc_to_text: 'Please determine the type of the question below. Here are some examples of questions.\n\n{{context}}\n{{input}}'
doc_to_target: '{{answers}}'
doc_to_target: '{{answers[0]}}'
generation_kwargs:
max_gen_toks: 64
temperature: 1
do_sample: True
until: ['\n']
metric_list:
- metric: !function metrics.classification_score
aggregation: mean
higher_is_better: True
metadata:
version: 1.0
version: 2.0
......@@ -6,14 +6,15 @@ dataset_path: THUDM/LongBench
test_split: test
dataset_name: triviaqa
doc_to_text: 'Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{{context}}\n\n{{input}}'
doc_to_target: '{{answers}}'
doc_to_target: '{{answers[0]}}'
generation_kwargs:
max_gen_toks: 32
temperature: 1
do_sample: True
until: ['\n']
metric_list:
- metric: !function metrics.qa_f1_score
aggregation: mean
higher_is_better: True
metadata:
version: 1.0
version: 2.0
......@@ -6,14 +6,15 @@ dataset_path: THUDM/LongBench
test_split: test
dataset_name: triviaqa_e
doc_to_text: 'Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{{context}}\n\n{{input}}'
doc_to_target: '{{answers}}'
doc_to_target: '{{answers[0]}}'
generation_kwargs:
max_gen_toks: 32
temperature: 1
do_sample: True
until: ['\n']
metric_list:
- metric: !function metrics.qa_f1_score
aggregation: mean
higher_is_better: True
metadata:
version: 1.0
version: 2.0
tag:
- longbench
task: longbench_vcsum
dataset_path: THUDM/LongBench
test_split: test
dataset_name: vcsum
doc_to_text: '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{{context}}\n\n会议总结:'
doc_to_target: '{{answers[0]}}'
generation_kwargs:
max_gen_toks: 512
temperature: 1
do_sample: True
until: []
metric_list:
- metric: !function metrics.rouge_zh_score
aggregation: mean
higher_is_better: True
metadata:
version: 2.0
dataset_path: hails/mmlu_no_train # a copy of `cais/mmlu` with no auxiliary_train split
dataset_path: cais/mmlu
test_split: test
fewshot_split: dev
fewshot_config:
......
......@@ -60,7 +60,7 @@ Repository = "https://github.com/EleutherAI/lm-evaluation-harness"
api = ["requests", "aiohttp", "tenacity", "tqdm", "tiktoken"]
audiolm_qwen = ["librosa", "soundfile"]
deepsparse = ["deepsparse-nightly[llm]>=1.8.0.20240404"]
dev = ["pytest", "pytest-cov", "pytest-xdist", "pre-commit", "mypy", "unitxt"]
dev = ["pytest", "pytest-cov", "pytest-xdist", "pre-commit", "mypy", "unitxt", "requests", "aiohttp", "tenacity", "tqdm", "tiktoken", "sentencepiece"]
gptq = ["auto-gptq[triton]>=0.6.0"]
gptqmodel = ["gptqmodel>=1.0.9"]
hf_transfer = ["hf_transfer"]
......@@ -69,7 +69,7 @@ ifeval = ["langdetect", "immutabledict", "nltk>=3.9.1"]
ipex = ["optimum"]
japanese_leaderboard = ["emoji==2.14.0", "neologdn==0.5.3", "fugashi[unidic-lite]", "rouge_score>=0.1.2"]
longbench=["jieba", "fuzzywuzzy", "rouge"]
mamba = ["mamba_ssm", "causal-conv1d==1.0.2"]
mamba = ["mamba_ssm", "causal-conv1d==1.0.2", "torch"]
math = ["sympy>=1.12", "antlr4-python3-runtime==4.11", "math_verify[antlr4_11_0]"]
multilingual = ["nagisa>=0.2.7", "jieba>=0.42.1", "pycountry"]
neuronx = ["optimum[neuronx]"]
......@@ -132,3 +132,8 @@ known-first-party = ["lm_eval"]
[tool.ruff.lint.extend-per-file-ignores]
"__init__.py" = ["F401","F402","F403"]
"utils.py" = ["F401"]
[dependency-groups]
dev = [
"api","dev","sentencepiece"
]
......@@ -18,7 +18,7 @@ def custom_task_tag():
@pytest.fixture(scope="module")
def task_yaml(pytestconfig, custom_task_name, custom_task_tag):
yield f"""include: {pytestconfig.rootpath}/lm_eval/tasks/hellaswag/hellaswag.yaml
yield f"""include: {pytestconfig.rootpath}/lm_eval/tasks/arc/arc_easy.yaml
task: {custom_task_name}
class: !function {custom_task_name}.MockPythonTask
tag:
......
......@@ -14,7 +14,7 @@ from .utils import new_tasks
datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Default Task
TASKS = ["include_base_44_dutch_few_shot_en_applied_science"]
TASKS = ["arc_easy"]
def get_new_tasks_else_default():
......
Markdown is supported
Attach a file by dragging & dropping, or 0% uploaded — try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment