Commit efb46937 authored by Baber

Merge branch 'main' into convert_gen

# Conflicts:
#	lm_eval/__main__.py
#	lm_eval/evaluator.py
parents 7fbf899c ade01428
task: cocoteros_es
dataset_path: gplsi/cocoteros
dataset_name: null
output_type: generate_until
doc_to_text: "Genera una frase corta con estas palabras: {{keywords}}. El contexto es: {{context}} \n\nRespuesta:"
doc_to_target: "{{text}}"
training_split: train
test_split: test
target_delimiter: ' '
generation_kwargs:
  max_gen_toks: 40
  until:
    - "\n"
metric_list:
  - metric: bleu
    aggregation: bleu
    higher_is_better: true
  - metric: !function utils.rouge1
    aggregation: !function utils.rouge1_agg
    higher_is_better: true
metadata:
  version: 1.0
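The two `!function` hooks above point to helpers in the task's utils.py, which is not included in this excerpt. The sketch below is illustrative only: it assumes the common harness pattern in which the per-sample metric is a pass-through and the actual scoring happens in the aggregation function, and it assumes the rouge_score package is available.

```python
# Illustrative sketch of a utils.py for cocoteros_es -- not code from this
# commit. Assumes the aggregation receives the list of (reference, prediction)
# pairs collected by the pass-through metric.
from rouge_score import rouge_scorer


def rouge1(items):
    # Per-sample hook: defer scoring to the aggregation step.
    return items


def rouge1_agg(items):
    # Mean ROUGE-1 F1 over all (reference, prediction) pairs.
    refs, preds = zip(*items)
    scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=False)
    scores = [scorer.score(r, p)["rouge1"].fmeasure for r, p in zip(refs, preds)]
    return sum(scores) / len(scores)
```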
@@ -13,5 +13,6 @@ task:
   - mgsm_direct_es_spanish_bench
   - flores_es
   - phrases_es
+  - cocoteros_es
 metadata:
   version: 1.0
@@ -33,7 +33,9 @@ class SQUADCompletion(ConfigurableTask):
     def doc_to_target(self, doc):
         return doc["value"]

-    def construct_requests(self, doc, ctx, **kwargs):
+    def construct_requests(
+        self, doc, ctx, chat_template=None, apply_chat_template=False, **kwargs
+    ):
         """Uses RequestFactory to construct Requests and returns an iterable of
         Requests which will be sent to the LM.
...
@@ -105,7 +105,9 @@ class SQuAD2(ConfigurableTask):
             answer = "unanswerable"
         return " " + answer

-    def construct_requests(self, doc, ctx, **kwargs):
+    def construct_requests(
+        self, doc, ctx, chat_template=None, apply_chat_template=False, **kwargs
+    ):
         """Uses RequestFactory to construct Requests and returns an iterable of
         Requests which will be sent to the LM.
...
@@ -33,7 +33,9 @@ class SWDE(ConfigurableTask):
     def doc_to_target(self, doc):
         return doc["value"]

-    def construct_requests(self, doc, ctx, **kwargs):
+    def construct_requests(
+        self, doc, ctx, chat_template=None, apply_chat_template=False, **kwargs
+    ):
         """Uses RequestFactory to construct Requests and returns an iterable of
         Requests which will be sent to the LM.
...
@@ -9,7 +9,7 @@ fewshot_config:
 output_type: multiple_choice
 doc_to_text: "Soru: {{ question.strip() }}\nA. {{ choices[0] }}\nB. {{ choices[1] }}\nC. {{ choices[2] }}\nD. {{ choices[3] }}\nE. {{ choices[4] }}\nCevap:"
 doc_to_choice: ["A", "B", "C", "D", "E"]
-doc_to_target: "{{['A', 'B', 'C', 'D', 'E'].index(answer)}}"
+doc_to_target: "{{ answer.strip() }}"
 metric_list:
   - metric: acc
     aggregation: mean
...
@@ -109,6 +109,7 @@ class Unitxt(ConfigurableTask):
         apply_chat_template: bool = False,
         fewshot_as_multiturn: bool = False,
         chat_template: Optional[Callable] = None,
+        gen_prefix: Optional[str] = None,
     ) -> str:
         source = self.doc_to_text(doc)
         if isinstance(source, list):
@@ -134,6 +135,7 @@ class Unitxt(ConfigurableTask):
         part of the document for `doc`.
         """
         kwargs.pop("apply_chat_template", False)  # Not used by unitxt
+        kwargs.pop("chat_template", False)  # Not used by unitxt
         return [
             Instance(
                 request_type="generate_until",
...
@@ -17,13 +17,6 @@ import yaml
 from jinja2 import BaseLoader, Environment, StrictUndefined

-logging.basicConfig(
-    format="%(asctime)s,%(msecs)03d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s",
-    datefmt="%Y-%m-%d:%H:%M:%S",
-    level=logging.INFO,
-)
-eval_logger = logging.getLogger("lm-eval")
-
 SPACING = " " * 47

 HIGHER_IS_BETTER_SYMBOLS = {
@@ -32,6 +25,33 @@ HIGHER_IS_BETTER_SYMBOLS = {
 }


+def setup_logging(verbosity=logging.INFO):
+    # Configure the root logger
+    log_level = os.environ.get("LOGLEVEL", verbosity) or verbosity
+    level_map = {
+        "DEBUG": logging.DEBUG,
+        "INFO": logging.INFO,
+        "WARNING": logging.WARNING,
+        "ERROR": logging.ERROR,
+        "CRITICAL": logging.CRITICAL,
+    }
+    log_level = level_map.get(str(log_level).upper(), logging.INFO)
+    if not logging.root.handlers:
+        logging.basicConfig(
+            format="%(asctime)s,%(msecs)03d %(levelname)-8s [%(name)s:%(lineno)d] %(message)s",
+            datefmt="%Y-%m-%d:%H:%M:%S",
+            level=log_level,
+        )
+        if log_level == logging.DEBUG:
+            third_party_loggers = ["urllib3", "filelock", "fsspec"]
+            for logger_name in third_party_loggers:
+                logging.getLogger(logger_name).setLevel(logging.INFO)
+    else:
+        logging.getLogger().setLevel(log_level)
+
+
 def hash_string(string: str) -> str:
     return hashlib.sha256(string.encode("utf-8")).hexdigest()
...
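With the import-time basicConfig call removed, the scripts patched below switch to plain module-level loggers. The snippet that follows is a minimal usage sketch, not part of this commit, and assumes setup_logging is exported from lm_eval.utils (the module this hunk appears to touch).

```python
# Usage sketch, not part of this commit. Assumes `setup_logging` lives in
# lm_eval.utils; configure logging once at the entry point, then create
# ordinary module-level loggers everywhere else.
import logging

from lm_eval.utils import setup_logging

setup_logging(verbosity=logging.DEBUG)  # or set the LOGLEVEL env var instead

eval_logger = logging.getLogger(__name__)
eval_logger.info("logging configured once; no basicConfig at import time")
```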
@@ -66,7 +66,7 @@ ibm_watsonx_ai = ["ibm_watsonx_ai>=1.1.22"]
 ifeval = ["langdetect", "immutabledict", "nltk>=3.9.1"]
 neuronx = ["optimum[neuronx]"]
 mamba = ["mamba_ssm", "causal-conv1d==1.0.2"]
-math = ["sympy>=1.12", "antlr4-python3-runtime==4.11"]
+math = ["sympy>=1.12", "antlr4-python3-runtime==4.11", "math_verify[antlr4_11_0]"]
 multilingual = ["nagisa>=0.2.7", "jieba>=0.42.1", "pycountry"]
 optimum = ["optimum[openvino]"]
 promptsource = ["promptsource>=0.2.3"]
...
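The math extra now additionally pulls in Math-Verify. The snippet below is a hedged illustration of that library's documented top-level API (parse/verify); it is not code from this commit, and the harness call sites are not shown in this diff.

```python
# Hedged illustration of the newly added math_verify dependency; not code
# from this commit.
from math_verify import parse, verify

gold = parse("$\\frac{1}{2}$")  # parse the gold target expression
answer = parse("0.5")           # parse the model's answer

# verify() checks mathematical equivalence across notations
# (fractions, decimals, sets, intervals, ...).
print(verify(gold, answer))  # expected: True
```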
 import argparse
+import logging
 import os

 import yaml
 from promptsource.templates import DatasetTemplates
 from tqdm import tqdm

 # from lm_eval.api.registry import ALL_TASKS
-from lm_eval.logger import eval_logger
+eval_logger = logging.getLogger(__name__)

 # from lm_eval.tasks import include_task_folder
...
@@ -10,7 +10,6 @@ import os

 from pytablewriter import LatexTableWriter, MarkdownTableWriter

-logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
...
@@ -11,7 +11,6 @@ from pytablewriter import MarkdownTableWriter

 from lm_eval import tasks

-logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
...
 import argparse
+import logging
 import os
 from typing import Dict, List, Tuple
@@ -8,11 +9,11 @@ import torch

 import lm_eval.evaluator
 import lm_eval.models.utils
-from lm_eval import tasks, utils
+from lm_eval import tasks

 os.environ["TOKENIZERS_PARALLELISM"] = "false"

-eval_logger = utils.eval_logger
+eval_logger = logging.getLogger(__name__)


 def memory_stats():
...
@@ -4,6 +4,7 @@ Usage:
 """

 import argparse
+import logging
 import os
 from typing import List
@@ -14,7 +15,9 @@ from transformers import (

 from lm_eval import simple_evaluate
 from lm_eval.evaluator import request_caching_arg_to_dict
-from lm_eval.utils import eval_logger
+
+eval_logger = logging.getLogger(__name__)

 MODULE_DIR = os.path.dirname(os.path.realpath(__file__))
...
 import argparse
+import logging
 import os
 import random
@@ -7,7 +8,10 @@ import numpy as np

 from lm_eval import tasks
 from lm_eval.evaluator_utils import get_task_list
 from lm_eval.tasks import TaskManager
-from lm_eval.utils import eval_logger, join_iters
+from lm_eval.utils import join_iters
+
+eval_logger = logging.getLogger(__name__)

 EXAMPLE_DIVIDER = "!!@@##@@!! -- Example {i}\n"
...
 import argparse
 import json
+import logging
 import os
 import re
 from pathlib import Path
@@ -8,13 +9,15 @@ import pandas as pd

 from zeno_client import ZenoClient, ZenoMetric

 from lm_eval.utils import (
-    eval_logger,
     get_latest_filename,
     get_results_filenames,
     get_sample_results_filenames,
 )

+eval_logger = logging.getLogger(__name__)
+

 def parse_args():
     parser = argparse.ArgumentParser(
         description="Upload your data to the Zeno AI evaluation platform to visualize results. This requires a ZENO_API_KEY in your environment variables. The eleuther harness must be run with log_samples=True and an output_path set for data to be written to disk."
...
from typing import List

import pytest
import torch

from lm_eval import evaluate, simple_evaluate, tasks
from lm_eval.api.instance import Instance
from lm_eval.tasks import get_task_dict

task_manager = tasks.TaskManager()


# We refer to vLLM's test but modify the trigger condition.
@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
# @pytest.mark.skip(reason="requires CUDA")
class Test_SGlang:
    sglang = pytest.importorskip("sglang")
    task_list = task_manager.load_task_or_group(["arc_easy", "gsm8k", "wikitext"])

    multiple_choice_task = task_list["arc_easy"]  # type: ignore
    multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1)
    MULTIPLE_CH: List[Instance] = multiple_choice_task.instances

    generate_until_task = task_list["gsm8k"]  # type: ignore
    generate_until_task._config.generation_kwargs["max_gen_toks"] = 10
    generate_until_task.build_all_requests(limit=10, rank=0, world_size=1)
    generate_until: List[Instance] = generate_until_task.instances

    rolling_task = task_list["wikitext"]  # type: ignore
    rolling_task.build_all_requests(limit=10, rank=0, world_size=1)
    ROLLING: List[Instance] = rolling_task.instances

    @classmethod
    def setup_class(cls):
        try:
            from lm_eval.models.sglang_causallms import SGLangLM

            # NOTE(jinwei): EleutherAI/pythia-70m is not supported by SGLang yet; we use Qwen models instead.
            cls.LM = SGLangLM(
                pretrained="Qwen/Qwen2-1.5B-Instruct",
                batch_size=1,
                tp_size=1,
                max_model_len=1024,
            )
        except Exception as e:
            pytest.fail(f"🔥 SGLangLM failed to initialize: {e}")

    def test_logliklihood(self) -> None:
        res = self.LM.loglikelihood(self.MULTIPLE_CH)
        assert len(res) == len(self.MULTIPLE_CH)
        for x in res:
            assert isinstance(x[0], float)

    def test_generate_until(self) -> None:
        res = self.LM.generate_until(self.generate_until)
        assert len(res) == len(self.generate_until)
        for x in res:
            assert isinstance(x, str)

    # NOTE(Jinwei): An A100 80GB is enough for these tests. If the last test,
    # "test_logliklihood_rolling", runs out of memory, reduce "max_model_len".
    def test_logliklihood_rolling(self) -> None:
        res = self.LM.loglikelihood_rolling(self.ROLLING)
        for x in res:
            assert isinstance(x, float)

    # def test_simple_evaluate(self) -> None:
    #     results = simple_evaluate(
    #         model=self.LM,
    #         tasks=["arc_easy"],
    #         # num_fewshot=0,
    #         task_manager=task_manager,
    #         limit=10,
    #     )
    #     print(results)
    #     accuracy = results["results"]["arc_easy"]["acc,none"]
    #     print(f"Accuracy: {accuracy}")

    # def test_evaluate(self) -> None:
    #     tasks = ["arc_easy"]
    #     task_dict = get_task_dict(tasks, task_manager)
    #     results = evaluate(
    #         lm=self.LM,
    #         task_dict=task_dict,
    #         limit=10,
    #     )
    #     print(results)
    #     accuracy = results["results"]["arc_easy"]["acc,none"]
    #     print(f"Accuracy: {accuracy}")

    # TODO(jinwei): find out the output differences for "gsm8k" between simple_evaluate() and evaluate(). There are some errors in the parser as well.
    def test_evaluator(self) -> None:
        simple_results = simple_evaluate(
            model=self.LM,
            tasks=["arc_easy"],
            task_manager=task_manager,
            limit=10,
        )
        assert simple_results is not None, "simple_evaluate returned None"
        # The accuracy on 10 data points is 0.7; a threshold of 0.5 leaves a buffer for fluctuations.
        assert simple_results["results"]["arc_easy"]["acc,none"] >= 0.5, (
            "The accuracy for simple_evaluate() is below 0.5!"
        )

        task_dict = get_task_dict(["arc_easy"], task_manager)
        evaluate_results = evaluate(
            lm=self.LM,
            task_dict=task_dict,
            limit=10,
        )
        assert evaluate_results is not None, "evaluate returned None"
        # The accuracy on 10 data points is 0.7; a threshold of 0.5 leaves a buffer for fluctuations.
        assert evaluate_results["results"]["arc_easy"]["acc,none"] >= 0.5, (
            "The accuracy for evaluate() is below 0.5!"
        )

        assert set(simple_results["results"].keys()) == set(
            evaluate_results["results"].keys()
        ), "Mismatch in task keys between simple_evaluate and evaluate"

        for task in simple_results["results"]:
            assert (
                simple_results["results"][task] == evaluate_results["results"][task]
            ), f"Mismatch in results for {task}"
        print(
            "✅ test_evaluator passed: simple_evaluate and evaluate results are identical."
        )