from typing import List

import pytest
import torch

from lm_eval import evaluate, simple_evaluate, tasks
from lm_eval.api.instance import Instance
from lm_eval.tasks import get_task_dict


task_manager = tasks.TaskManager()


# We follow vLLM's test but modify the trigger condition.
@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
# @pytest.mark.skip(reason="requires CUDA")
class Test_SGlang:
    sglang = pytest.importorskip("sglang")

    # Build a small batch of requests (10 documents each) for the three request types.
    task_list = task_manager.load_task_or_group(["arc_easy", "gsm8k", "wikitext"])
    multiple_choice_task = task_list["arc_easy"]  # type: ignore
    multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1)
    MULTIPLE_CH: List[Instance] = multiple_choice_task.instances

    generate_until_task = task_list["gsm8k"]  # type: ignore
    generate_until_task._config.generation_kwargs["max_gen_toks"] = 10
    generate_until_task.build_all_requests(limit=10, rank=0, world_size=1)
    generate_until: List[Instance] = generate_until_task.instances

    rolling_task = task_list["wikitext"]  # type: ignore
    rolling_task.build_all_requests(limit=10, rank=0, world_size=1)
    ROLLING: List[Instance] = rolling_task.instances

    @classmethod
    def setup_class(cls):
        try:
            from lm_eval.models.sglang_causallms import SGLangLM

            # NOTE(jinwei): EleutherAI/pythia-70m is not supported by SGLang yet,
            # so we use a Qwen model instead.
            cls.LM = SGLangLM(
                pretrained="Qwen/Qwen2-1.5B-Instruct",
                batch_size=1,
                tp_size=1,
                max_model_len=1024,
            )
        except Exception as e:
            pytest.fail(f"🔥 SGLangLM failed to initialize: {e}")

    def test_loglikelihood(self) -> None:
        res = self.LM.loglikelihood(self.MULTIPLE_CH)
        assert len(res) == len(self.MULTIPLE_CH)
        for x in res:
            assert isinstance(x[0], float)

    def test_generate_until(self) -> None:
        res = self.LM.generate_until(self.generate_until)
        assert len(res) == len(self.generate_until)
        for x in res:
            assert isinstance(x, str)

    # NOTE(jinwei): An A100 80GB is enough for these tests. If the last test,
    # "test_loglikelihood_rolling", runs out of memory, reduce "max_model_len".
    def test_loglikelihood_rolling(self) -> None:
        res = self.LM.loglikelihood_rolling(self.ROLLING)
        for x in res:
            assert isinstance(x, float)

    # def test_simple_evaluate(self) -> None:
    #     results = simple_evaluate(
    #         model=self.LM,
    #         tasks=["arc_easy"],
    #         # num_fewshot=0,
    #         task_manager=task_manager,
    #         limit=10,
    #     )
    #     print(results)
    #     accuracy = results["results"]["arc_easy"]["acc,none"]
    #     print(f"Accuracy: {accuracy}")

    # def test_evaluate(self) -> None:
    #     tasks = ["arc_easy"]
    #     task_dict = get_task_dict(tasks, task_manager)
    #     results = evaluate(
    #         lm=self.LM,
    #         task_dict=task_dict,
    #         limit=10,
    #     )
    #     print(results)
    #     accuracy = results["results"]["arc_easy"]["acc,none"]
    #     print(f"Accuracy: {accuracy}")

    # TODO(jinwei): find out the output differences for "gsm8k" with
    # simple_evaluate() and evaluate(). There are some errors in the parser as well.
    def test_evaluator(self) -> None:
        simple_results = simple_evaluate(
            model=self.LM,
            tasks=["arc_easy"],
            task_manager=task_manager,
            limit=10,
        )
        assert simple_results is not None, "simple_evaluate returned None"
        # The accuracy over these 10 data points is 0.7; a threshold of 0.5
        # provides a buffer against run-to-run fluctuations.
        assert simple_results["results"]["arc_easy"]["acc,none"] >= 0.5, (
            "The accuracy for simple_evaluate() is below 0.5!"
        )

        task_dict = get_task_dict(["arc_easy"], task_manager)
        evaluate_results = evaluate(
            lm=self.LM,
            task_dict=task_dict,
            limit=10,
        )
        assert evaluate_results is not None, "evaluate returned None"
        # The accuracy over these 10 data points is 0.7; a threshold of 0.5
        # provides a buffer against run-to-run fluctuations.
        assert evaluate_results["results"]["arc_easy"]["acc,none"] >= 0.5, (
            "The accuracy for evaluate() is below 0.5!"
        )

        assert set(simple_results["results"].keys()) == set(
            evaluate_results["results"].keys()
        ), "Mismatch in task keys between simple_evaluate and evaluate"
        for task in simple_results["results"]:
            assert (
                simple_results["results"][task] == evaluate_results["results"][task]
            ), f"Mismatch in results for {task}"

        print(
            "✅ test_evaluator passed: simple_evaluate and evaluate results are identical."
        )
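

# A minimal sketch for running this module directly during local debugging,
# assuming pytest is installed and a CUDA device is available (without CUDA the
# class-level skipif marker skips every test). The "-v" flag and the use of
# pytest.main() are illustrative choices, not part of the original test suite.
if __name__ == "__main__":
    import sys

    # pytest.main() returns an exit code; propagate it to the shell.
    sys.exit(pytest.main([__file__, "-v"]))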