Commit 220e6456 authored by zhuwenwen
Browse files

update qwen2-moe layout and benchmark_throughput.py

parent 96ae75ad
...@@ -165,9 +165,9 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase, ...@@ -165,9 +165,9 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
def run_vllm( def run_vllm(
warmup_requests: List[SampleRequest],
requests: List[SampleRequest], requests: List[SampleRequest],
n: int, n: int,
num_iters_warmup: int,
engine_args: EngineArgs, engine_args: EngineArgs,
) -> float: ) -> float:
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
...@@ -193,40 +193,23 @@ def run_vllm( ...@@ -193,40 +193,23 @@ def run_vllm(
lora_requests = [request.lora_request for request in requests] lora_requests = [request.lora_request for request in requests]
# warmup # warmup
warmup_prompts: List[TextPrompt] = [] warmup_sampling_params = SamplingParams(
warmup_sampling_params: List[SamplingParams] = [] n=args.n,
for request in warmup_requests: temperature=1.0,
warmup_prompts.append( top_p=1.0,
TextPrompt(prompt=request.prompt, ignore_eos=True,
multi_modal_data=request.multi_modal_data)) max_tokens=10,
warmup_sampling_params.append( )
SamplingParams( dummy_prompt_token_ids = np.random.randint(10000, size=(1,10))
n=n, dummy_prompts: List[PromptType] = [{
temperature=1.0, "prompt_token_ids": batch
top_p=1.0, } for batch in dummy_prompt_token_ids.tolist()]
ignore_eos=True,
max_tokens=request.expected_output_len,
))
print("Warming up...")
for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
llm.generate(warmup_prompts, warmup_sampling_params, use_tqdm=True)
# dummy_prompt_token_ids = np.random.randint(10000, print("Warming up...")
# size=(args.num_prompts, for _ in tqdm(range(num_iters_warmup), desc="Warmup iterations"):
# args.input_len)) llm.generate(dummy_prompts,
# dummy_prompts: List[PromptType] = [{ sampling_params=warmup_sampling_params,
# "prompt_token_ids": batch use_tqdm=False)
# } for batch in dummy_prompt_token_ids.tolist()]
# def run_to_completion(profile_dir: Optional[str] = None):
# llm.generate(dummy_prompts,
# sampling_params=sampling_params,
# use_tqdm=False)
# print("Warming up...")
# for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
# run_to_completion(profile_dir=None)
use_beam_search = False use_beam_search = False
...@@ -384,9 +367,6 @@ def main(args: argparse.Namespace): ...@@ -384,9 +367,6 @@ def main(args: argparse.Namespace):
# Sample the requests. # Sample the requests.
tokenizer = AutoTokenizer.from_pretrained( tokenizer = AutoTokenizer.from_pretrained(
args.tokenizer, trust_remote_code=args.trust_remote_code) args.tokenizer, trust_remote_code=args.trust_remote_code)
warmup_prompt = "hi" * 10
warmup_requests = [(warmup_prompt, 10, 10)
for _ in range(1)]
if args.dataset is None: if args.dataset is None:
vocab_size = tokenizer.vocab_size vocab_size = tokenizer.vocab_size
requests = [] requests = []
...@@ -442,7 +422,7 @@ def main(args: argparse.Namespace): ...@@ -442,7 +422,7 @@ def main(args: argparse.Namespace):
args.disable_frontend_multiprocessing, args.disable_frontend_multiprocessing,
)) ))
else: else:
elapsed_time = run_vllm(warmup_requests, requests, args.n, elapsed_time = run_vllm(requests, args.n, args.num_iters_warmup,
EngineArgs.from_cli_args(args)) EngineArgs.from_cli_args(args))
elif args.backend == "hf": elif args.backend == "hf":
assert args.tensor_parallel_size == 1 assert args.tensor_parallel_size == 1
......
...@@ -165,9 +165,9 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase, ...@@ -165,9 +165,9 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
def run_vllm( def run_vllm(
warmup_requests: List[SampleRequest],
requests: List[SampleRequest], requests: List[SampleRequest],
n: int, n: int,
num_iters_warmup: int,
engine_args: EngineArgs, engine_args: EngineArgs,
) -> float: ) -> float:
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
...@@ -193,40 +193,23 @@ def run_vllm( ...@@ -193,40 +193,23 @@ def run_vllm(
lora_requests = [request.lora_request for request in requests] lora_requests = [request.lora_request for request in requests]
# warmup # warmup
warmup_prompts: List[TextPrompt] = [] warmup_sampling_params = SamplingParams(
warmup_sampling_params: List[SamplingParams] = [] n=args.n,
for request in warmup_requests: temperature=1.0,
warmup_prompts.append( top_p=1.0,
TextPrompt(prompt=request.prompt, ignore_eos=True,
multi_modal_data=request.multi_modal_data)) max_tokens=10,
warmup_sampling_params.append( )
SamplingParams( dummy_prompt_token_ids = np.random.randint(10000, size=(1,10))
n=n, dummy_prompts: List[PromptType] = [{
temperature=1.0, "prompt_token_ids": batch
top_p=1.0, } for batch in dummy_prompt_token_ids.tolist()]
ignore_eos=True,
max_tokens=request.expected_output_len,
))
print("Warming up...")
for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
llm.generate(warmup_prompts, warmup_sampling_params, use_tqdm=True)
# dummy_prompt_token_ids = np.random.randint(10000, print("Warming up...")
# size=(args.num_prompts, for _ in tqdm(range(num_iters_warmup), desc="Warmup iterations"):
# args.input_len)) llm.generate(dummy_prompts,
# dummy_prompts: List[PromptType] = [{ sampling_params=warmup_sampling_params,
# "prompt_token_ids": batch use_tqdm=False)
# } for batch in dummy_prompt_token_ids.tolist()]
# def run_to_completion(profile_dir: Optional[str] = None):
# llm.generate(dummy_prompts,
# sampling_params=sampling_params,
# use_tqdm=False)
# print("Warming up...")
# for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
# run_to_completion(profile_dir=None)
use_beam_search = False use_beam_search = False
...@@ -384,9 +367,6 @@ def main(args: argparse.Namespace): ...@@ -384,9 +367,6 @@ def main(args: argparse.Namespace):
# Sample the requests. # Sample the requests.
tokenizer = AutoTokenizer.from_pretrained( tokenizer = AutoTokenizer.from_pretrained(
args.tokenizer, trust_remote_code=args.trust_remote_code) args.tokenizer, trust_remote_code=args.trust_remote_code)
warmup_prompt = "hi" * 10
warmup_requests = [(warmup_prompt, 10, 10)
for _ in range(1)]
if args.dataset is None: if args.dataset is None:
vocab_size = tokenizer.vocab_size vocab_size = tokenizer.vocab_size
requests = [] requests = []
...@@ -442,7 +422,7 @@ def main(args: argparse.Namespace): ...@@ -442,7 +422,7 @@ def main(args: argparse.Namespace):
args.disable_frontend_multiprocessing, args.disable_frontend_multiprocessing,
)) ))
else: else:
elapsed_time = run_vllm(warmup_requests, requests, args.n, elapsed_time = run_vllm(requests, args.n, args.num_iters_warmup,
EngineArgs.from_cli_args(args)) EngineArgs.from_cli_args(args))
elif args.backend == "hf": elif args.backend == "hf":
assert args.tensor_parallel_size == 1 assert args.tensor_parallel_size == 1
......
...@@ -553,7 +553,7 @@ class Qwen2MoeForCausalLM(nn.Module, SupportsPP): ...@@ -553,7 +553,7 @@ class Qwen2MoeForCausalLM(nn.Module, SupportsPP):
weight_loader = getattr(param, "weight_loader", weight_loader = getattr(param, "weight_loader",
default_weight_loader) default_weight_loader)
weight_loader(param, loaded_weight) weight_loader(param, loaded_weight)
loaded_params.add(name) loaded_params.add(name)
if self.use_llama_nn and self.quant_method is None: if self.use_llama_nn and self.quant_method is None:
lay_key_words = [ lay_key_words = [
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment