Commit 220e6456 authored by zhuwenwen
Browse files

update qwen2-moe layout and benchmark_throughput.py

parent 96ae75ad
...@@ -165,9 +165,9 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase, ...@@ -165,9 +165,9 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
def run_vllm( def run_vllm(
warmup_requests: List[SampleRequest],
requests: List[SampleRequest], requests: List[SampleRequest],
n: int, n: int,
num_iters_warmup: int,
engine_args: EngineArgs, engine_args: EngineArgs,
) -> float: ) -> float:
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
...@@ -193,40 +193,23 @@ def run_vllm( ...@@ -193,40 +193,23 @@ def run_vllm(
lora_requests = [request.lora_request for request in requests] lora_requests = [request.lora_request for request in requests]
# warmup # warmup
warmup_prompts: List[TextPrompt] = [] warmup_sampling_params = SamplingParams(
warmup_sampling_params: List[SamplingParams] = [] n=args.n,
for request in warmup_requests:
warmup_prompts.append(
TextPrompt(prompt=request.prompt,
multi_modal_data=request.multi_modal_data))
warmup_sampling_params.append(
SamplingParams(
n=n,
temperature=1.0, temperature=1.0,
top_p=1.0, top_p=1.0,
ignore_eos=True, ignore_eos=True,
max_tokens=request.expected_output_len, max_tokens=10,
)) )
dummy_prompt_token_ids = np.random.randint(10000, size=(1,10))
dummy_prompts: List[PromptType] = [{
"prompt_token_ids": batch
} for batch in dummy_prompt_token_ids.tolist()]
print("Warming up...") print("Warming up...")
for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"): for _ in tqdm(range(num_iters_warmup), desc="Warmup iterations"):
llm.generate(warmup_prompts, warmup_sampling_params, use_tqdm=True) llm.generate(dummy_prompts,
sampling_params=warmup_sampling_params,
# dummy_prompt_token_ids = np.random.randint(10000, use_tqdm=False)
# size=(args.num_prompts,
# args.input_len))
# dummy_prompts: List[PromptType] = [{
# "prompt_token_ids": batch
# } for batch in dummy_prompt_token_ids.tolist()]
# def run_to_completion(profile_dir: Optional[str] = None):
# llm.generate(dummy_prompts,
# sampling_params=sampling_params,
# use_tqdm=False)
# print("Warming up...")
# for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
# run_to_completion(profile_dir=None)
use_beam_search = False use_beam_search = False
...@@ -384,9 +367,6 @@ def main(args: argparse.Namespace): ...@@ -384,9 +367,6 @@ def main(args: argparse.Namespace):
# Sample the requests. # Sample the requests.
tokenizer = AutoTokenizer.from_pretrained( tokenizer = AutoTokenizer.from_pretrained(
args.tokenizer, trust_remote_code=args.trust_remote_code) args.tokenizer, trust_remote_code=args.trust_remote_code)
warmup_prompt = "hi" * 10
warmup_requests = [(warmup_prompt, 10, 10)
for _ in range(1)]
if args.dataset is None: if args.dataset is None:
vocab_size = tokenizer.vocab_size vocab_size = tokenizer.vocab_size
requests = [] requests = []
...@@ -442,7 +422,7 @@ def main(args: argparse.Namespace): ...@@ -442,7 +422,7 @@ def main(args: argparse.Namespace):
args.disable_frontend_multiprocessing, args.disable_frontend_multiprocessing,
)) ))
else: else:
elapsed_time = run_vllm(warmup_requests, requests, args.n, elapsed_time = run_vllm(requests, args.n, args.num_iters_warmup,
EngineArgs.from_cli_args(args)) EngineArgs.from_cli_args(args))
elif args.backend == "hf": elif args.backend == "hf":
assert args.tensor_parallel_size == 1 assert args.tensor_parallel_size == 1
......
...@@ -165,9 +165,9 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase, ...@@ -165,9 +165,9 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
def run_vllm( def run_vllm(
warmup_requests: List[SampleRequest],
requests: List[SampleRequest], requests: List[SampleRequest],
n: int, n: int,
num_iters_warmup: int,
engine_args: EngineArgs, engine_args: EngineArgs,
) -> float: ) -> float:
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
...@@ -193,40 +193,23 @@ def run_vllm( ...@@ -193,40 +193,23 @@ def run_vllm(
lora_requests = [request.lora_request for request in requests] lora_requests = [request.lora_request for request in requests]
# warmup # warmup
warmup_prompts: List[TextPrompt] = [] warmup_sampling_params = SamplingParams(
warmup_sampling_params: List[SamplingParams] = [] n=args.n,
for request in warmup_requests:
warmup_prompts.append(
TextPrompt(prompt=request.prompt,
multi_modal_data=request.multi_modal_data))
warmup_sampling_params.append(
SamplingParams(
n=n,
temperature=1.0, temperature=1.0,
top_p=1.0, top_p=1.0,
ignore_eos=True, ignore_eos=True,
max_tokens=request.expected_output_len, max_tokens=10,
)) )
dummy_prompt_token_ids = np.random.randint(10000, size=(1,10))
dummy_prompts: List[PromptType] = [{
"prompt_token_ids": batch
} for batch in dummy_prompt_token_ids.tolist()]
print("Warming up...") print("Warming up...")
for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"): for _ in tqdm(range(num_iters_warmup), desc="Warmup iterations"):
llm.generate(warmup_prompts, warmup_sampling_params, use_tqdm=True) llm.generate(dummy_prompts,
sampling_params=warmup_sampling_params,
# dummy_prompt_token_ids = np.random.randint(10000, use_tqdm=False)
# size=(args.num_prompts,
# args.input_len))
# dummy_prompts: List[PromptType] = [{
# "prompt_token_ids": batch
# } for batch in dummy_prompt_token_ids.tolist()]
# def run_to_completion(profile_dir: Optional[str] = None):
# llm.generate(dummy_prompts,
# sampling_params=sampling_params,
# use_tqdm=False)
# print("Warming up...")
# for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
# run_to_completion(profile_dir=None)
use_beam_search = False use_beam_search = False
...@@ -384,9 +367,6 @@ def main(args: argparse.Namespace): ...@@ -384,9 +367,6 @@ def main(args: argparse.Namespace):
# Sample the requests. # Sample the requests.
tokenizer = AutoTokenizer.from_pretrained( tokenizer = AutoTokenizer.from_pretrained(
args.tokenizer, trust_remote_code=args.trust_remote_code) args.tokenizer, trust_remote_code=args.trust_remote_code)
warmup_prompt = "hi" * 10
warmup_requests = [(warmup_prompt, 10, 10)
for _ in range(1)]
if args.dataset is None: if args.dataset is None:
vocab_size = tokenizer.vocab_size vocab_size = tokenizer.vocab_size
requests = [] requests = []
...@@ -442,7 +422,7 @@ def main(args: argparse.Namespace): ...@@ -442,7 +422,7 @@ def main(args: argparse.Namespace):
args.disable_frontend_multiprocessing, args.disable_frontend_multiprocessing,
)) ))
else: else:
elapsed_time = run_vllm(warmup_requests, requests, args.n, elapsed_time = run_vllm(requests, args.n, args.num_iters_warmup,
EngineArgs.from_cli_args(args)) EngineArgs.from_cli_args(args))
elif args.backend == "hf": elif args.backend == "hf":
assert args.tensor_parallel_size == 1 assert args.tensor_parallel_size == 1
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment