"src/vscode:/vscode.git/clone" did not exist on "8092c8bda17e670da3168037fbf8b88b8252e59b"
Commit 9c0d4e93 authored by baberabb

test data parallel

parent bf26d979
 from collections import defaultdict
 from typing import List, Tuple, Optional, Literal, Union
+from transformers import AutoTokenizer
 from lm_eval.api.instance import Instance
 from lm_eval.api.model import LM
 import copy
 from tqdm import tqdm
 from lm_eval.api.registry import register_model
 from lm_eval import utils
+from ray.util.multiprocessing import Pool

 try:
     from vllm import LLM, SamplingParams
@@ -17,6 +19,11 @@ except ModuleNotFoundError:
 eval_logger = utils.eval_logger


+def run_inference_one_gpu(model_args: dict, sampling_params, requests: List[int]):
+    llm = LLM(**model_args)
+    return llm.generate(prompt_token_ids=requests, sampling_params=sampling_params)
+
+
 @register_model("vllm")
 class VLLM(LM):
     _DEFAULT_MAX_LENGTH = 2048
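The helper added in this hunk is the per-replica worker: it builds a fresh vLLM engine from plain keyword arguments and generates on one shard of pre-tokenized requests. Below is a minimal, self-contained sketch of that pattern, fanned out with ray.util.multiprocessing.Pool the same way the _model_generate hunk further down does; the checkpoint name, token ids, and sampling settings are illustrative placeholders, not part of this commit.

# Sketch only: one engine per replica, built inside the worker process, since a
# live LLM object holds GPU state and is not meant to be shipped across processes.
from ray.util.multiprocessing import Pool  # Ray's drop-in replacement for multiprocessing.Pool
from vllm import LLM, SamplingParams


def run_inference_one_gpu(model_args: dict, sampling_params, requests):
    llm = LLM(**model_args)  # each replica constructs its own engine from plain kwargs
    return llm.generate(prompt_token_ids=requests, sampling_params=sampling_params)


if __name__ == "__main__":
    # Placeholder checkpoint and token ids, for illustration only.
    model_args = {"model": "facebook/opt-125m", "gpu_memory_utilization": 0.9, "seed": 1234}
    sampling_params = SamplingParams(max_tokens=16)
    shards = [[[1, 2, 3, 4]], [[5, 6, 7, 8]]]  # pre-tokenized prompts, one shard per replica
    with Pool(processes=len(shards)) as pool:
        results = pool.starmap(
            run_inference_one_gpu,
            [(model_args, sampling_params, shard) for shard in shards],
        )
    outputs = [o for replica in results for o in replica]  # flatten per-replica outputs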
@@ -38,6 +45,7 @@ class VLLM(LM):
         seed: int = 1234,
         gpu_memory_utilization: float = 0.9,
         device: str = "cuda",
+        data_parallel: int = 1,
     ):
         super().__init__()
@@ -50,19 +58,28 @@ please install vllm via `pip install lm-eval[vllm]` or `pip install -e .[vllm]`"
             )
         assert "cuda" in device or device is None, "vLLM only supports CUDA"
-        self.model = LLM(
-            model=pretrained,
-            gpu_memory_utilization=float(gpu_memory_utilization),
-            revision=revision,
-            dtype=dtype,
-            tokenizer_mode=tokenizer_mode,
-            trust_remote_code=trust_remote_code,
-            tensor_parallel_size=int(tensor_parallel_size),
-            swap_space=int(swap_space),
-            quantization=quantization,
-            seed=int(seed),
-        )
-        self.tokenizer = self.model.get_tokenizer()
+        self.tensor_parallel_size = int(tensor_parallel_size)
+        self.data_parallel = int(data_parallel)
+        self.model_args = {
+            "model": pretrained,
+            "gpu_memory_utilization": float(gpu_memory_utilization),
+            "revision": revision,
+            "dtype": dtype,
+            "tokenizer_mode": tokenizer_mode,
+            "trust_remote_code": trust_remote_code,
+            "tensor_parallel_size": int(tensor_parallel_size),
+            "swap_space": int(swap_space),
+            "quantization": quantization,
+            "seed": int(seed),
+        }
+        if self.data_parallel <= 1:
+            self.model = LLM(**self.model_args)
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            pretrained,
+            revision=revision,
+            trust_remote_code=trust_remote_code,
+            use_fast=True if tokenizer_mode == "auto" else False,
+        )
         self.batch_size = batch_size
         self._max_length = max_length
         self._max_gen_toks = max_gen_toks
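In the constructor hunk above, the engine arguments are kept as a plain dict and the engine itself is only built in the driver when data_parallel <= 1; the tokenizer now comes directly from transformers, so prompts can be encoded even when no engine lives in the driver process. A small sketch of that idea with illustrative values (the checkpoint name is a placeholder):

# Sketch: the driver holds only picklable kwargs plus a standalone tokenizer;
# replicas call LLM(**model_args) themselves inside run_inference_one_gpu.
import pickle
from transformers import AutoTokenizer

model_args = {"model": "facebook/opt-125m", "dtype": "auto", "seed": 1234}  # toy values
assert pickle.loads(pickle.dumps(model_args)) == model_args  # safe to send to workers

tokenizer = AutoTokenizer.from_pretrained(model_args["model"], use_fast=True)
prompt_token_ids = tokenizer("Hello world").input_ids  # encoding works without an engine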
@@ -76,8 +93,8 @@ please install vllm via `pip install lm-eval[vllm]` or `pip install -e .[vllm]`"
     def max_length(self):
         if self._max_length:  # if max length manually set, return it
             return self._max_length
-        if hasattr(self.model.llm_engine.model_config, "max_model_len"):
-            return self.model.llm_engine.model_config.max_model_len
+        if hasattr(self.tokenizer, "model_max_length"):
+            return self.tokenizer.model_max_length
         return self._DEFAULT_MAX_LENGTH

     @property
@@ -114,23 +131,31 @@ please install vllm via `pip install lm-eval[vllm]` or `pip install -e .[vllm]`"
         if "do_sample" in kwargs.keys():
             kwargs.pop("do_sample")
         if generate:
-            generate_sampling_params = SamplingParams(
-                max_tokens=max_tokens, stop=stop, **kwargs
-            )
-            outputs = self.model.generate(
-                prompt_token_ids=requests,
-                sampling_params=generate_sampling_params,
-                use_tqdm=use_tqdm,
-            )
+            sampling_params = SamplingParams(max_tokens=max_tokens, stop=stop, **kwargs)
         else:
-            logliklihood_sampling_params = SamplingParams(
+            sampling_params = SamplingParams(
                 temperature=0, prompt_logprobs=2, max_tokens=1
             )
-            outputs = self.model.generate(
-                prompt_token_ids=requests,
-                sampling_params=logliklihood_sampling_params,
-                use_tqdm=use_tqdm,
-            )
+        if self.data_parallel > 1:
+            req_list = []
+            for replicas in range(self.data_parallel):
+                reqs = utils.create_iterator(
+                    requests, rank=replicas, world_size=self.data_parallel
+                )
+                req_list.append(reqs)
+            inputs = [(self.model_args, sampling_params, req) for req in req_list]
+            with Pool(processes=self.data_parallel) as pool:
+                results = pool.starmap(run_inference_one_gpu, inputs)
+            # flatten results
+            return [item for sublist in results for item in sublist]
+        outputs = self.model.generate(
+            prompt_token_ids=requests,
+            sampling_params=sampling_params,
+            use_tqdm=use_tqdm,
+        )
         return outputs

     def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
...
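Taken together, _model_generate now builds a single SamplingParams object, then either calls the locally constructed engine (data_parallel <= 1) or shards the pre-tokenized requests round-robin with utils.create_iterator, runs one run_inference_one_gpu worker per replica through the Ray pool, and flattens the per-replica results. A hypothetical driver-side use of the new flag, assuming only the constructor signature shown in this diff (the checkpoint name is illustrative):

# Hypothetical usage; only the data_parallel parameter is new in this commit.
lm = VLLM(pretrained="EleutherAI/pythia-160m", data_parallel=2, tensor_parallel_size=1)
# With data_parallel > 1 the constructor skips LLM(...) in the driver; requests are
# tokenized with the transformers tokenizer, split across two replicas, and each
# replica builds its own engine inside run_inference_one_gpu before generating.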