Commit efb46937 authored by Baber

Merge branch 'main' into convert_gen

# Conflicts:
#	lm_eval/__main__.py
#	lm_eval/evaluator.py
parents 7fbf899c ade01428
task: cocoteros_es
dataset_path: gplsi/cocoteros
dataset_name: null
output_type: generate_until
doc_to_text: "Genera una frase corta con estas palabras: {{keywords}}. El contexto es: {{context}} \n\nRespuesta:"
doc_to_target: "{{text}}"
training_split: train
test_split: test
target_delimiter: ' '
generation_kwargs:
  max_gen_toks: 40
  until:
    - "\n"
metric_list:
  - metric: bleu
    aggregation: bleu
    higher_is_better: true
  - metric: !function utils.rouge1
    aggregation: !function utils.rouge1_agg
    higher_is_better: true
metadata:
  version: 1.0
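The two `!function` hooks above point at helpers in the task's utils.py, which is not part of this hunk. A rough sketch of what they plausibly contain, assuming the passthrough-metric pattern used by other generate_until tasks and the Hugging Face `evaluate` ROUGE scorer (both of which are assumptions, not taken from this diff):

# Hypothetical utils.py for cocoteros_es; the evaluate-based scorer is an assumption.
import evaluate


def rouge1(items):
    # Passthrough: the harness collects a (reference, prediction) pair per doc
    # and hands the full list to the aggregation function below.
    return items


def rouge1_agg(items):
    # items: list of (reference, prediction) tuples accumulated over the test split.
    refs, preds = zip(*items)
    scorer = evaluate.load("rouge")
    return scorer.compute(predictions=list(preds), references=list(refs))["rouge1"]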
......@@ -13,5 +13,6 @@ task:
  - mgsm_direct_es_spanish_bench
  - flores_es
  - phrases_es
+  - cocoteros_es
metadata:
  version: 1.0
......@@ -33,7 +33,9 @@ class SQUADCompletion(ConfigurableTask):
    def doc_to_target(self, doc):
        return doc["value"]

-    def construct_requests(self, doc, ctx, **kwargs):
+    def construct_requests(
+        self, doc, ctx, chat_template=None, apply_chat_template=False, **kwargs
+    ):
        """Uses RequestFactory to construct Requests and returns an iterable of
        Requests which will be sent to the LM.
......
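The method bodies are truncated above (and in the SQuAD2 and SWDE hunks below, which make the same signature change): the new chat_template / apply_chat_template parameters are accepted for compatibility with the updated Task API, while the task still returns a single generate_until request. A rough sketch of that shape, with illustrative generation kwargs that are not taken from this diff:

# Sketch of a construct_requests override on one of these completion tasks.
from lm_eval.api.instance import Instance


def construct_requests(
    self, doc, ctx, chat_template=None, apply_chat_template=False, **kwargs
):
    # chat_template / apply_chat_template are accepted so the evaluator can pass
    # them uniformly; a plain completion task can simply ignore them.
    return [
        Instance(
            request_type="generate_until",
            doc=doc,
            arguments=(ctx, {"until": ["\n"], "max_gen_toks": 48}),  # illustrative values
            idx=0,
            **kwargs,
        )
    ]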
......@@ -105,7 +105,9 @@ class SQuAD2(ConfigurableTask):
answer = "unanswerable"
return " " + answer
def construct_requests(self, doc, ctx, **kwargs):
def construct_requests(
self, doc, ctx, chat_template=None, apply_chat_template=False, **kwargs
):
"""Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
......
......@@ -33,7 +33,9 @@ class SWDE(ConfigurableTask):
    def doc_to_target(self, doc):
        return doc["value"]

-    def construct_requests(self, doc, ctx, **kwargs):
+    def construct_requests(
+        self, doc, ctx, chat_template=None, apply_chat_template=False, **kwargs
+    ):
        """Uses RequestFactory to construct Requests and returns an iterable of
        Requests which will be sent to the LM.
......
......@@ -9,7 +9,7 @@ fewshot_config:
output_type: multiple_choice
doc_to_text: "Soru: {{ question.strip() }}\nA. {{ choices[0] }}\nB. {{ choices[1] }}\nC. {{ choices[2] }}\nD. {{ choices[3] }}\nE. {{ choices[4] }}\nCevap:"
doc_to_choice: ["A", "B", "C", "D", "E"]
doc_to_target: "{{['A', 'B', 'C', 'D', 'E'].index(answer)}}"
doc_to_target: "{{ answer.strip() }}"
metric_list:
- metric: acc
aggregation: mean
......
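Both the old and the new template resolve to the same gold choice, only in different forms: an index into doc_to_choice versus the letter itself. A quick way to see the two renderings for a hypothetical doc (the sample answer value is made up):

# Hypothetical doc; "C" stands in for whatever the dataset stores in `answer`.
from jinja2 import Environment, StrictUndefined

env = Environment(undefined=StrictUndefined)
doc = {"answer": "C"}

old_target = env.from_string("{{ ['A', 'B', 'C', 'D', 'E'].index(answer) }}")
new_target = env.from_string("{{ answer.strip() }}")

print(old_target.render(**doc))  # "2"  (an index into doc_to_choice)
print(new_target.render(**doc))  # "C"  (the choice label itself)

The letter form also tolerates stray whitespace in the dataset field, which the list .index() lookup would not.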
......@@ -109,6 +109,7 @@ class Unitxt(ConfigurableTask):
        apply_chat_template: bool = False,
        fewshot_as_multiturn: bool = False,
        chat_template: Optional[Callable] = None,
+        gen_prefix: Optional[str] = None,
    ) -> str:
        source = self.doc_to_text(doc)
        if isinstance(source, list):
......@@ -134,6 +135,7 @@ class Unitxt(ConfigurableTask):
        part of the document for `doc`.
        """
        kwargs.pop("apply_chat_template", False)  # Not used by unitxt
+        kwargs.pop("chat_template", False)  # Not used by unitxt
        return [
            Instance(
                request_type="generate_until",
......
......@@ -17,13 +17,6 @@ import yaml
from jinja2 import BaseLoader, Environment, StrictUndefined

-logging.basicConfig(
-    format="%(asctime)s,%(msecs)03d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s",
-    datefmt="%Y-%m-%d:%H:%M:%S",
-    level=logging.INFO,
-)
-eval_logger = logging.getLogger("lm-eval")

SPACING = " " * 47

HIGHER_IS_BETTER_SYMBOLS = {
......@@ -32,6 +25,33 @@ HIGHER_IS_BETTER_SYMBOLS = {
}


+def setup_logging(verbosity=logging.INFO):
+    # Configure the root logger
+    log_level = os.environ.get("LOGLEVEL", verbosity) or verbosity
+    level_map = {
+        "DEBUG": logging.DEBUG,
+        "INFO": logging.INFO,
+        "WARNING": logging.WARNING,
+        "ERROR": logging.ERROR,
+        "CRITICAL": logging.CRITICAL,
+    }
+    log_level = level_map.get(str(log_level).upper(), logging.INFO)
+    if not logging.root.handlers:
+        logging.basicConfig(
+            format="%(asctime)s,%(msecs)03d %(levelname)-8s [%(name)s:%(lineno)d] %(message)s",
+            datefmt="%Y-%m-%d:%H:%M:%S",
+            level=log_level,
+        )
+        if log_level == logging.DEBUG:
+            third_party_loggers = ["urllib3", "filelock", "fsspec"]
+            for logger_name in third_party_loggers:
+                logging.getLogger(logger_name).setLevel(logging.INFO)
+    else:
+        logging.getLogger().setLevel(log_level)


def hash_string(string: str) -> str:
    return hashlib.sha256(string.encode("utf-8")).hexdigest()
......
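A minimal usage sketch of the new helper at a script entry point, assuming setup_logging lives in lm_eval.utils as this hunk suggests: configure the root logger once, then grab per-module loggers. LOGLEVEL in the environment overrides the verbosity argument.

import logging

from lm_eval.utils import setup_logging

setup_logging()  # honors LOGLEVEL from the environment, defaults to INFO
eval_logger = logging.getLogger(__name__)
eval_logger.info("logging configured")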
......@@ -66,7 +66,7 @@ ibm_watsonx_ai = ["ibm_watsonx_ai>=1.1.22"]
ifeval = ["langdetect", "immutabledict", "nltk>=3.9.1"]
neuronx = ["optimum[neuronx]"]
mamba = ["mamba_ssm", "causal-conv1d==1.0.2"]
math = ["sympy>=1.12", "antlr4-python3-runtime==4.11"]
math = ["sympy>=1.12", "antlr4-python3-runtime==4.11", "math_verify[antlr4_11_0]"]
multilingual = ["nagisa>=0.2.7", "jieba>=0.42.1", "pycountry"]
optimum = ["optimum[openvino]"]
promptsource = ["promptsource>=0.2.3"]
......
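The math extra now additionally pulls in Math-Verify, pinned to the ANTLR 4.11 runtime so it matches antlr4-python3-runtime==4.11. A minimal sketch of how that package is typically used to compare a gold expression with a model answer; the parse/verify names follow the package's documented API and the expressions are purely illustrative:

# Illustrative only; assumes the math extra (and thus math_verify) is installed.
from math_verify import parse, verify

gold = parse("$\\frac{1}{2}$")
pred = parse("0.5")
print(verify(gold, pred))  # True if the parsed expressions are mathematically equivalent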
import argparse
import logging
import os
import yaml
from promptsource.templates import DatasetTemplates
from tqdm import tqdm
# from lm_eval.api.registry import ALL_TASKS
-from lm_eval.logger import eval_logger
+eval_logger = logging.getLogger(__name__)
# from lm_eval.tasks import include_task_folder
......
......@@ -10,7 +10,6 @@ import os
from pytablewriter import LatexTableWriter, MarkdownTableWriter
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
......
......@@ -11,7 +11,6 @@ from pytablewriter import MarkdownTableWriter
from lm_eval import tasks
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
......
import argparse
import logging
import os
from typing import Dict, List, Tuple
......@@ -8,11 +9,11 @@ import torch
import lm_eval.evaluator
import lm_eval.models.utils
-from lm_eval import tasks, utils
+from lm_eval import tasks

os.environ["TOKENIZERS_PARALLELISM"] = "false"

-eval_logger = utils.eval_logger
+eval_logger = logging.getLogger(__name__)
def memory_stats():
......
......@@ -4,6 +4,7 @@ Usage:
"""
import argparse
import logging
import os
from typing import List
......@@ -14,7 +15,9 @@ from transformers import (
from lm_eval import simple_evaluate
from lm_eval.evaluator import request_caching_arg_to_dict
-from lm_eval.utils import eval_logger
+eval_logger = logging.getLogger(__name__)
MODULE_DIR = os.path.dirname(os.path.realpath(__file__))
......
import argparse
import logging
import os
import random
......@@ -7,7 +8,10 @@ import numpy as np
from lm_eval import tasks
from lm_eval.evaluator_utils import get_task_list
from lm_eval.tasks import TaskManager
-from lm_eval.utils import eval_logger, join_iters
+from lm_eval.utils import join_iters
+
+eval_logger = logging.getLogger(__name__)
EXAMPLE_DIVIDER = "!!@@##@@!! -- Example {i}\n"
......
import argparse
import json
import logging
import os
import re
from pathlib import Path
......@@ -8,13 +9,15 @@ import pandas as pd
from zeno_client import ZenoClient, ZenoMetric
from lm_eval.utils import (
-    eval_logger,
    get_latest_filename,
    get_results_filenames,
    get_sample_results_filenames,
)

+eval_logger = logging.getLogger(__name__)
def parse_args():
    parser = argparse.ArgumentParser(
        description="Upload your data to the Zeno AI evaluation platform to visualize results. This requires a ZENO_API_KEY in your environment variables. The eleuther harness must be run with log_samples=True and an output_path set for data to be written to disk."
......
from typing import List
import pytest
import torch
from lm_eval import evaluate, simple_evaluate, tasks
from lm_eval.api.instance import Instance
from lm_eval.tasks import get_task_dict
task_manager = tasks.TaskManager()
# We refer to vLLM's test but modify the trigger condition.
@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
# @pytest.mark.skip(reason="requires CUDA")
class Test_SGlang:
    sglang = pytest.importorskip("sglang")

    task_list = task_manager.load_task_or_group(["arc_easy", "gsm8k", "wikitext"])
    multiple_choice_task = task_list["arc_easy"]  # type: ignore
    multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1)
    MULTIPLE_CH: List[Instance] = multiple_choice_task.instances
    generate_until_task = task_list["gsm8k"]  # type: ignore
    generate_until_task._config.generation_kwargs["max_gen_toks"] = 10
    generate_until_task.build_all_requests(limit=10, rank=0, world_size=1)
    generate_until: List[Instance] = generate_until_task.instances
    rolling_task = task_list["wikitext"]  # type: ignore
    rolling_task.build_all_requests(limit=10, rank=0, world_size=1)
    ROLLING: List[Instance] = rolling_task.instances

    @classmethod
    def setup_class(cls):
        try:
            from lm_eval.models.sglang_causallms import SGLangLM

            # NOTE(jinwei): EleutherAI/pythia-70m is not supported by SGLang yet. Instead we use Qwen models.
            cls.LM = SGLangLM(
                pretrained="Qwen/Qwen2-1.5B-Instruct",
                batch_size=1,
                tp_size=1,
                max_model_len=1024,
            )
        except Exception as e:
            pytest.fail(f"🔥 SGLangLM failed to initialize: {e}")

    def test_loglikelihood(self) -> None:
        res = self.LM.loglikelihood(self.MULTIPLE_CH)
        assert len(res) == len(self.MULTIPLE_CH)
        for x in res:
            assert isinstance(x[0], float)

    def test_generate_until(self) -> None:
        res = self.LM.generate_until(self.generate_until)
        assert len(res) == len(self.generate_until)
        for x in res:
            assert isinstance(x, str)

    # NOTE(Jinwei): An A100 80GB is enough for our tests. If you run the last test "test_loglikelihood_rolling" and OOM happens, please reduce "max_model_len".
    def test_loglikelihood_rolling(self) -> None:
        res = self.LM.loglikelihood_rolling(self.ROLLING)
        for x in res:
            assert isinstance(x, float)

    # def test_simple_evaluate(self) -> None:
    #     results = simple_evaluate(
    #         model=self.LM,
    #         tasks=["arc_easy"],
    #         # num_fewshot=0,
    #         task_manager=task_manager,
    #         limit=10,
    #     )
    #     print(results)
    #     accuracy = results["results"]["arc_easy"]["acc,none"]
    #     print(f"Accuracy: {accuracy}")

    # def test_evaluate(self) -> None:
    #     tasks = ["arc_easy"]
    #     task_dict = get_task_dict(tasks, task_manager)
    #     results = evaluate(
    #         lm=self.LM,
    #         task_dict=task_dict,
    #         limit=10,
    #     )
    #     print(results)
    #     accuracy = results["results"]["arc_easy"]["acc,none"]
    #     print(f"Accuracy: {accuracy}")

    # TODO(jinwei): find out the output differences for "gsm8k" between simple_evaluate() and evaluate(). There are some errors in the parser as well.
    def test_evaluator(self) -> None:
        simple_results = simple_evaluate(
            model=self.LM,
            tasks=["arc_easy"],
            task_manager=task_manager,
            limit=10,
        )
        assert simple_results is not None, "simple_evaluate returned None"
        # The accuracy on 10 data points is about 0.7; a 0.5 threshold leaves a buffer for run-to-run fluctuation.
        assert simple_results["results"]["arc_easy"]["acc,none"] >= 0.5, (
            "The accuracy for simple_evaluate() is below 0.5!"
        )

        task_dict = get_task_dict(["arc_easy"], task_manager)
        evaluate_results = evaluate(
            lm=self.LM,
            task_dict=task_dict,
            limit=10,
        )
        assert evaluate_results is not None, "evaluate returned None"
        # The accuracy on 10 data points is about 0.7; a 0.5 threshold leaves a buffer for run-to-run fluctuation.
        assert evaluate_results["results"]["arc_easy"]["acc,none"] >= 0.5, (
            "The accuracy for evaluate() is below 0.5!"
        )

        assert set(simple_results["results"].keys()) == set(
            evaluate_results["results"].keys()
        ), "Mismatch in task keys between simple_evaluate and evaluate"
        for task in simple_results["results"]:
            assert (
                simple_results["results"][task] == evaluate_results["results"][task]
            ), f"Mismatch in results for {task}"
        print(
            "✅ test_evaluator passed: simple_evaluate and evaluate results are identical."
        )