Commit e7c1b7f3 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge branch 'v0.5.4-dtk24.04.1'

parents 7462218e 04c62b93
...@@ -11,8 +11,10 @@ from tqdm import tqdm ...@@ -11,8 +11,10 @@ from tqdm import tqdm
from transformers import (AutoModelForCausalLM, AutoTokenizer, from transformers import (AutoModelForCausalLM, AutoTokenizer,
PreTrainedTokenizerBase) PreTrainedTokenizerBase)
from vllm.inputs import PromptStrictInputs from vllm.engine.arg_utils import EngineArgs
from vllm.inputs import PromptInputs
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
from vllm.utils import FlexibleArgumentParser
def sample_requests( def sample_requests(
...@@ -84,6 +86,7 @@ def run_vllm( ...@@ -84,6 +86,7 @@ def run_vllm(
distributed_executor_backend: Optional[str], distributed_executor_backend: Optional[str],
gpu_memory_utilization: float = 0.9, gpu_memory_utilization: float = 0.9,
download_dir: Optional[str] = None, download_dir: Optional[str] = None,
load_format: str = EngineArgs.load_format,
) -> float: ) -> float:
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
llm = LLM( llm = LLM(
...@@ -105,11 +108,12 @@ def run_vllm( ...@@ -105,11 +108,12 @@ def run_vllm(
enable_chunked_prefill=enable_chunked_prefill, enable_chunked_prefill=enable_chunked_prefill,
max_num_batched_tokens=max_num_batched_tokens, max_num_batched_tokens=max_num_batched_tokens,
distributed_executor_backend=distributed_executor_backend, distributed_executor_backend=distributed_executor_backend,
load_format=load_format,
) )
# Add the requests to the engine. # Add the requests to the engine.
prompts = [] prompts: List[str] = []
sampling_params = [] sampling_params: List[SamplingParams] = []
for prompt, _, output_len in requests: for prompt, _, output_len in requests:
prompts.append(prompt) prompts.append(prompt)
sampling_params.append( sampling_params.append(
...@@ -144,7 +148,7 @@ def run_vllm( ...@@ -144,7 +148,7 @@ def run_vllm(
# dummy_prompt_token_ids = np.random.randint(10000, # dummy_prompt_token_ids = np.random.randint(10000,
# size=(args.num_prompts, # size=(args.num_prompts,
# args.input_len)) # args.input_len))
# dummy_inputs: List[PromptStrictInputs] = [{ # dummy_inputs: List[PromptInputs] = [{
# "prompt_token_ids": batch # "prompt_token_ids": batch
# } for batch in dummy_prompt_token_ids.tolist()] # } for batch in dummy_prompt_token_ids.tolist()]
...@@ -270,7 +274,7 @@ def main(args: argparse.Namespace): ...@@ -270,7 +274,7 @@ def main(args: argparse.Namespace):
args.quantization_param_path, args.device, args.quantization_param_path, args.device,
args.enable_prefix_caching, args.enable_chunked_prefill, args.enable_prefix_caching, args.enable_chunked_prefill,
args.max_num_batched_tokens, args.distributed_executor_backend, args.max_num_batched_tokens, args.distributed_executor_backend,
args.gpu_memory_utilization, args.download_dir) args.gpu_memory_utilization, args.download_dir, args.load_format)
elif args.backend == "hf": elif args.backend == "hf":
assert args.tensor_parallel_size == 1 assert args.tensor_parallel_size == 1
elapsed_time = run_hf(requests, args.model, tokenizer, args.n, elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
...@@ -283,6 +287,7 @@ def main(args: argparse.Namespace): ...@@ -283,6 +287,7 @@ def main(args: argparse.Namespace):
raise ValueError(f"Unknown backend: {args.backend}") raise ValueError(f"Unknown backend: {args.backend}")
total_num_tokens = sum(prompt_len + output_len total_num_tokens = sum(prompt_len + output_len
for _, prompt_len, output_len in requests) for _, prompt_len, output_len in requests)
if args.dataset is None: if args.dataset is None:
total_out_tokens = args.output_len * args.num_prompts total_out_tokens = args.output_len * args.num_prompts
else: else:
...@@ -307,7 +312,7 @@ def main(args: argparse.Namespace): ...@@ -307,7 +312,7 @@ def main(args: argparse.Namespace):
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Benchmark the throughput.") parser = FlexibleArgumentParser(description="Benchmark the throughput.")
parser.add_argument("--backend", parser.add_argument("--backend",
type=str, type=str,
choices=["vllm", "hf", "mii"], choices=["vllm", "hf", "mii"],
...@@ -398,9 +403,10 @@ if __name__ == "__main__": ...@@ -398,9 +403,10 @@ if __name__ == "__main__":
parser.add_argument( parser.add_argument(
"--device", "--device",
type=str, type=str,
default="cuda", default="auto",
choices=["cuda", "cpu"], choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"],
help='device type for vLLM execution, supporting CUDA and CPU.') help='device type for vLLM execution, supporting CUDA, OpenVINO and '
'CPU.')
parser.add_argument( parser.add_argument(
"--enable-prefix-caching", "--enable-prefix-caching",
action='store_true', action='store_true',
...@@ -430,6 +436,29 @@ if __name__ == "__main__": ...@@ -430,6 +436,29 @@ if __name__ == "__main__":
help='Backend to use for distributed serving. When more than 1 GPU ' help='Backend to use for distributed serving. When more than 1 GPU '
'is used, will be automatically set to "ray" if installed ' 'is used, will be automatically set to "ray" if installed '
'or "mp" (multiprocessing) otherwise.') 'or "mp" (multiprocessing) otherwise.')
parser.add_argument(
'--load-format',
type=str,
default=EngineArgs.load_format,
choices=[
'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer',
'bitsandbytes'
],
help='The format of the model weights to load.\n\n'
'* "auto" will try to load the weights in the safetensors format '
'and fall back to the pytorch bin format if safetensors format '
'is not available.\n'
'* "pt" will load the weights in the pytorch bin format.\n'
'* "safetensors" will load the weights in the safetensors format.\n'
'* "npcache" will load the weights in pytorch format and store '
'a numpy cache to speed up the loading.\n'
'* "dummy" will initialize the weights with random values, '
'which is mainly for profiling.\n'
'* "tensorizer" will load the weights using tensorizer from '
'CoreWeave. See the Tensorize vLLM Model script in the Examples'
'section for more information.\n'
'* "bitsandbytes" will load the weights using bitsandbytes '
'quantization.\n')
args = parser.parse_args() args = parser.parse_args()
if args.tokenizer is None: if args.tokenizer is None:
args.tokenizer = args.model args.tokenizer = args.model
......
...@@ -3,52 +3,9 @@ from typing import List ...@@ -3,52 +3,9 @@ from typing import List
from vllm.utils import Device from vllm.utils import Device
_BLANK_TOKEN_ID = -1
DEFAULT_LAST_ACCESSED_TIME = -1 DEFAULT_LAST_ACCESSED_TIME = -1
class LogicalTokenBlock:
"""A block that stores a contiguous chunk of tokens from left to right.
Logical blocks are used to represent the states of the corresponding
physical blocks in the KV cache.
"""
def __init__(
self,
block_number: int,
block_size: int,
) -> None:
self.block_number = block_number
self.block_size = block_size
self.token_ids = [_BLANK_TOKEN_ID] * block_size
self.num_tokens = 0
def is_empty(self) -> bool:
return self.num_tokens == 0
def get_num_empty_slots(self) -> int:
return self.block_size - self.num_tokens
def is_full(self) -> bool:
return self.num_tokens == self.block_size
def append_tokens(self, token_ids: List[int]) -> None:
assert len(token_ids) <= self.get_num_empty_slots()
curr_idx = self.num_tokens
self.token_ids[curr_idx:curr_idx + len(token_ids)] = token_ids
self.num_tokens += len(token_ids)
def get_token_ids(self) -> List[int]:
return self.token_ids[:self.num_tokens]
def get_last_token_id(self) -> int:
assert self.num_tokens > 0
return self.token_ids[self.num_tokens - 1]
class PhysicalTokenBlock: class PhysicalTokenBlock:
"""Represents the state of a block in the KV cache.""" """Represents the state of a block in the KV cache."""
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment