Unverified Commit 96b6f475 authored by Kunshang Ji, committed by GitHub

Remove hardcoded `device="cuda"` to support more devices (#2503)


Co-authored-by: Jiang Li <jiang1.li@intel.com>
Co-authored-by: Kunshang Ji <kunshang.ji@intel.com>
parent c410f5d0
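The commit threads a new `device` option from the command line down to the engine instead of hardcoding CUDA. A minimal sketch of how the option is used after this change (the model name and prompt are placeholders for illustration, not part of the diff; "cuda" is currently the only accepted value):

from vllm import LLM, SamplingParams

# The --device flag added to the benchmark scripts and to EngineArgs maps to
# this constructor argument.
llm = LLM(
    model="facebook/opt-125m",  # placeholder model, any supported model works
    device="cuda",              # new argument; only "cuda" is valid for now
)
outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(max_tokens=16))
print(outputs[0].outputs[0].text)

The benchmark scripts in the diff below expose the same knob as a `--device` command-line flag.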
@@ -25,6 +25,7 @@ def main(args: argparse.Namespace):
         dtype=args.dtype,
         enforce_eager=args.enforce_eager,
         kv_cache_dtype=args.kv_cache_dtype,
+        device=args.device,
     )
     sampling_params = SamplingParams(
@@ -135,5 +136,11 @@ if __name__ == '__main__':
         default=None,
         help=('path to save the pytorch profiler output. Can be visualized '
               'with ui.perfetto.dev or Tensorboard.'))
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="cuda",
+        choices=["cuda"],
+        help='device type for vLLM execution, supporting CUDA only currently.')
     args = parser.parse_args()
     main(args)
@@ -72,6 +72,7 @@ def run_vllm(
     max_model_len: Optional[int],
     enforce_eager: bool,
     kv_cache_dtype: str,
+    device: str,
 ) -> float:
     from vllm import LLM, SamplingParams
     llm = LLM(
@@ -85,6 +86,7 @@ def run_vllm(
         max_model_len=max_model_len,
         enforce_eager=enforce_eager,
         kv_cache_dtype=kv_cache_dtype,
+        device=device,
     )
     # Add the requests to the engine.
@@ -209,7 +211,7 @@ def main(args: argparse.Namespace):
                                 args.seed, args.n, args.use_beam_search,
                                 args.trust_remote_code, args.dtype,
                                 args.max_model_len, args.enforce_eager,
-                                args.kv_cache_dtype)
+                                args.kv_cache_dtype, args.device)
     elif args.backend == "hf":
         assert args.tensor_parallel_size == 1
         elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
@@ -294,6 +296,12 @@ if __name__ == "__main__":
         default="auto",
         help=
         'Data type for kv cache storage. If "auto", will use model data type.')
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="cuda",
+        choices=["cuda"],
+        help='device type for vLLM execution, supporting CUDA only currently.')
     args = parser.parse_args()
     if args.tokenizer is None:
         args.tokenizer = args.model
...
@@ -25,18 +25,20 @@ def main(
     dtype: torch.dtype,
     seed: int,
     do_profile: bool,
+    device: str = "cuda",
     kv_cache_dtype: Optional[str] = None,
 ) -> None:
     random.seed(seed)
     torch.random.manual_seed(seed)
-    torch.cuda.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
     scale = float(1.0 / (head_size**0.5))
     query = torch.empty(num_seqs,
                         num_query_heads,
                         head_size,
                         dtype=dtype,
-                        device="cuda")
+                        device=device)
     query.uniform_(-scale, scale)
     assert num_query_heads % num_kv_heads == 0
@@ -44,11 +46,11 @@ def main(
     if use_alibi:
         alibi_slopes = torch.randn(num_query_heads,
                                    dtype=torch.float,
-                                   device="cuda")
+                                   device=device)
     context_lens = [context_len for _ in range(num_seqs)]
     max_context_len = max(context_lens)
-    context_lens = torch.tensor(context_lens, dtype=torch.int, device="cuda")
+    context_lens = torch.tensor(context_lens, dtype=torch.int, device=device)
     # Create the block tables.
     max_num_blocks_per_seq = (max_context_len + block_size - 1) // block_size
@@ -59,12 +61,17 @@ def main(
             for _ in range(max_num_blocks_per_seq)
         ]
         block_tables.append(block_table)
-    block_tables = torch.tensor(block_tables, dtype=torch.int, device="cuda")
+    block_tables = torch.tensor(block_tables, dtype=torch.int, device=device)
     # Create the KV cache.
-    key_caches, value_caches = create_kv_caches_with_random(
-        NUM_BLOCKS, block_size, 1, num_kv_heads, head_size, kv_cache_dtype,
-        dtype)
+    key_caches, value_caches = create_kv_caches_with_random(NUM_BLOCKS,
+                                                            block_size,
+                                                            1,
+                                                            num_kv_heads,
+                                                            head_size,
+                                                            kv_cache_dtype,
+                                                            dtype,
+                                                            device=device)
     key_cache, value_cache = key_caches[0], value_caches[0]
     # Prepare for the paged attention kernel.
@@ -84,7 +91,7 @@ def main(
     )
     max_logits = torch.empty_like(exp_sums)
-    def run_benchmark(num_iters: int, profile: bool = False) -> float:
+    def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
         torch.cuda.synchronize()
         if profile:
             torch.cuda.cudart().cudaProfilerStart()
@@ -135,6 +142,7 @@ def main(
     # Warmup.
     print("Warming up...")
+    run_benchmark = run_cuda_benchmark
     run_benchmark(num_iters=3, profile=False)
     # Benchmark.
@@ -175,6 +183,7 @@ if __name__ == '__main__':
         default="auto",
         help=
         'Data type for kv cache storage. If "auto", will use model data type.')
+    parser.add_argument("--device", type=str, choices=["cuda"], default="cuda")
    args = parser.parse_args()
    print(args)
...
@@ -7,26 +7,29 @@ DTYPES = [torch.half, torch.bfloat16, torch.float]
 NUM_TOKENS = [7, 83, 2048]  # Arbitrary values for testing
 D = [512, 4096, 5120, 13824]  # Arbitrary values for testing
 SEEDS = [0]
-DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)]
+CUDA_DEVICES = [
+    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
+]
 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
 @pytest.mark.parametrize("d", D)
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("seed", SEEDS)
-@pytest.mark.parametrize("device", DEVICES)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
 @torch.inference_mode()
 def test_silu_and_mul(
     num_tokens: int,
     d: int,
     dtype: torch.dtype,
     seed: int,
-    device: int,
+    device: str,
 ) -> None:
     torch.random.manual_seed(seed)
-    torch.cuda.manual_seed(seed)
-    gpu_id = f"cuda:{device}"
-    x = torch.randn(num_tokens, 2 * d, dtype=dtype, device=gpu_id)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+    torch.set_default_device(device)
+    x = torch.randn(num_tokens, 2 * d, dtype=dtype)
     layer = SiluAndMul()
     out = layer(x)
     ref_out = layer._forward(x)
@@ -37,19 +40,20 @@ def test_silu_and_mul(
 @pytest.mark.parametrize("d", D)
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("seed", SEEDS)
-@pytest.mark.parametrize("device", DEVICES)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
 @torch.inference_mode()
 def test_gelu_new(
     num_tokens: int,
     d: int,
     dtype: torch.dtype,
     seed: int,
-    device: int,
+    device: str,
 ) -> None:
     torch.random.manual_seed(seed)
-    torch.cuda.manual_seed(seed)
-    gpu_id = f"cuda:{device}"
-    x = torch.randn(num_tokens, d, dtype=dtype, device=gpu_id)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+    torch.set_default_device(device)
+    x = torch.randn(num_tokens, d, dtype=dtype)
     layer = NewGELU()
     out = layer(x)
     ref_out = layer._forward(x)
@@ -60,18 +64,19 @@ def test_gelu_new(
 @pytest.mark.parametrize("d", D)
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("seed", SEEDS)
-@pytest.mark.parametrize("device", DEVICES)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
 def test_gelu_fast(
     num_tokens: int,
     d: int,
     dtype: torch.dtype,
     seed: int,
-    device: int,
+    device: str,
 ) -> None:
     torch.random.manual_seed(seed)
-    torch.cuda.manual_seed(seed)
-    gpu_id = f"cuda:{device}"
-    x = torch.randn(num_tokens, d, dtype=dtype, device=gpu_id)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+    torch.set_default_device(device)
+    x = torch.randn(num_tokens, d, dtype=dtype)
     layer = FastGELU()
     out = layer(x)
     ref_out = layer._forward(x)
...
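The kernel tests above (and those that follow) all converge on the same device-agnostic pattern: parametrize over explicit "cuda:i" strings, guard CUDA seeding behind torch.cuda.is_available(), and rely on torch.set_default_device() instead of passing device= on every tensor. A self-contained sketch of that pattern, with a stand-in test body that is not code from this commit (torch.set_default_device requires PyTorch 2.0 or later):

import pytest
import torch

CUDA_DEVICES = [
    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
]


@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_device_agnostic_op(device: str) -> None:
    torch.random.manual_seed(0)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(0)
    # Every tensor created below lands on `device` without a device= argument.
    torch.set_default_device(device)
    x = torch.randn(8, 16)
    assert x.device == torch.device(device)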
@@ -27,7 +27,9 @@ BLOCK_SIZES = [16, 32]
 USE_ALIBI = [False, True]
 KV_CACHE_DTYPE = ["auto", "fp8_e5m2"]
 SEEDS = [0]
-DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)]
+CUDA_DEVICES = [
+    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
+]
 def ref_masked_attention(
@@ -91,7 +93,7 @@ def ref_single_query_cached_kv_attention(
         alibi_bias = None
         if alibi_slopes is not None:
             # Create the ALiBi bias used in the paged attention kernel.
-            position_ids = torch.arange(context_len, device=query.device).int()
+            position_ids = torch.arange(context_len).int()
             alibi_bias = (position_ids - context_len + 1).float()
             alibi_bias = alibi_slopes.view(-1, 1, 1) * alibi_bias.view(
                 1, 1, -1)
@@ -110,7 +112,7 @@ def ref_single_query_cached_kv_attention(
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
 @pytest.mark.parametrize("seed", SEEDS)
-@pytest.mark.parametrize("device", DEVICES)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
 def test_paged_attention(
     kv_cache_factory,
     version: str,
@@ -122,33 +124,28 @@ def test_paged_attention(
     dtype: torch.dtype,
     kv_cache_dtype: str,
     seed: int,
-    device: int,
+    device: str,
 ) -> None:
     random.seed(seed)
     torch.random.manual_seed(seed)
-    torch.cuda.manual_seed(seed)
-    gpu_id = f"cuda:{device}"
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+    torch.set_default_device(device)
     scale = float(1.0 / (head_size**0.5))
     num_query_heads, num_kv_heads = num_heads
-    query = torch.empty(num_seqs,
-                        num_query_heads,
-                        head_size,
-                        dtype=dtype,
-                        device=gpu_id)
+    query = torch.empty(num_seqs, num_query_heads, head_size, dtype=dtype)
     query.uniform_(-scale, scale)
     assert num_query_heads % num_kv_heads == 0
     num_queries_per_kv = num_query_heads // num_kv_heads
     alibi_slopes = None
     if use_alibi:
-        alibi_slopes = torch.randn(num_query_heads,
-                                   dtype=torch.float,
-                                   device=gpu_id)
+        alibi_slopes = torch.randn(num_query_heads, dtype=torch.float)
     context_lens = [random.randint(1, MAX_SEQ_LEN) for _ in range(num_seqs)]
     context_lens[-1] = MAX_SEQ_LEN
     max_context_len = max(context_lens)
-    context_lens = torch.tensor(context_lens, dtype=torch.int, device=gpu_id)
+    context_lens = torch.tensor(context_lens, dtype=torch.int)
     # Create the block tables.
     max_num_blocks_per_seq = (max_context_len + block_size - 1) // block_size
@@ -159,13 +156,13 @@ def test_paged_attention(
             for _ in range(max_num_blocks_per_seq)
         ]
         block_tables.append(block_table)
-    block_tables = torch.tensor(block_tables, dtype=torch.int, device=gpu_id)
+    block_tables = torch.tensor(block_tables, dtype=torch.int)
     # Create the KV caches.
     key_caches, value_caches = kv_cache_factory(NUM_BLOCKS, block_size, 1,
                                                 num_kv_heads, head_size,
                                                 kv_cache_dtype, dtype, seed,
-                                                gpu_id)
+                                                device)
     key_cache, value_cache = key_caches[0], value_caches[0]
     # Call the paged attention kernel.
@@ -193,12 +190,10 @@ def test_paged_attention(
         tmp_output = torch.empty(
             size=(num_seqs, num_heads, num_partitions, head_size),
             dtype=output.dtype,
-            device=output.device,
         )
         exp_sums = torch.empty(
             size=(num_seqs, num_heads, num_partitions),
             dtype=torch.float32,
-            device=output.device,
         )
         max_logits = torch.empty_like(exp_sums)
         ops.paged_attention_v2(
@@ -229,14 +224,14 @@ def test_paged_attention(
                            block_size, x)
         dequantized_key_cache = torch.empty(size=key_cache_shape,
                                             dtype=dtype,
-                                            device=gpu_id)
+                                            device=device)
         cache_ops.convert_fp8_e5m2(key_cache, dequantized_key_cache)
         key_cache = dequantized_key_cache
         value_cache_shape = value_cache.shape
         dequantized_value_cache = torch.empty(size=value_cache_shape,
                                               dtype=dtype,
-                                              device=gpu_id)
+                                              device=device)
         cache_ops.convert_fp8_e5m2(value_cache, dequantized_value_cache)
         value_cache = dequantized_value_cache
@@ -283,7 +278,7 @@ def ref_multi_query_kv_attention(
         attn_mask = torch.triu(torch.ones(seq_len, seq_len, dtype=dtype),
                                diagonal=1)
         attn_mask = attn_mask * torch.finfo(dtype).min
-        attn_mask = attn_mask.to(dtype=dtype, device=query.device)
+        attn_mask = attn_mask.to(dtype=dtype)
         ref_output = ref_masked_attention(
             query[start_idx:end_idx],
@@ -303,7 +298,7 @@ def ref_multi_query_kv_attention(
 @pytest.mark.parametrize("head_size", HEAD_SIZES)
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("seed", SEEDS)
-@pytest.mark.parametrize("device", DEVICES)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
 @torch.inference_mode()
 def test_multi_query_kv_attention(
     num_seqs: int,
@@ -311,12 +306,13 @@ def test_multi_query_kv_attention(
     head_size: int,
     dtype: torch.dtype,
     seed: int,
-    device: int,
+    device: str,
 ) -> None:
     random.seed(seed)
     torch.random.manual_seed(seed)
-    torch.cuda.manual_seed(seed)
-    gpu_id = f"cuda:{device}"
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+    torch.set_default_device(device)
     # MAX_SEQ_LEN sometimes causes OOM in the reference implementation.
     # As the xformers library is already tested with its own tests, we can use
     # a smaller MAX_SEQ_LEN here.
@@ -329,8 +325,7 @@ def test_multi_query_kv_attention(
     qkv = torch.empty(num_tokens,
                       num_query_heads + 2 * num_kv_heads,
                       head_size,
-                      dtype=dtype,
-                      device=gpu_id)
+                      dtype=dtype)
     qkv.uniform_(-scale, scale)
     query, key, value = qkv.split(
         [num_query_heads, num_kv_heads, num_kv_heads], dim=1)
...
@@ -17,7 +17,9 @@ BLOCK_SIZES = [8, 16, 32]
 NUM_BLOCKS = [1024, 3600]  # Arbitrary values for testing
 NUM_MAPPINGS = [256]  # Arbitrary values for testing
 SEEDS = [0]
-DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)]
+CUDA_DEVICES = [
+    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
+]
 KV_CACHE_DTYPE = ["auto", "fp8_e5m2"]
@@ -29,7 +31,7 @@ KV_CACHE_DTYPE = ["auto", "fp8_e5m2"]
 @pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("seed", SEEDS)
-@pytest.mark.parametrize("device", DEVICES)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
 @pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
 @torch.inference_mode()
 def test_copy_blocks(
@@ -42,13 +44,14 @@ def test_copy_blocks(
     num_blocks: int,
     dtype: torch.dtype,
     seed: int,
-    device: int,
     kv_cache_dtype: str,
+    device: str,
 ) -> None:
     random.seed(seed)
     torch.random.manual_seed(seed)
-    torch.cuda.manual_seed(seed)
-    gpu_id = f"cuda:{device}"
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+    torch.set_default_device(device)
     # Generate random block mappings where each source block is mapped to two
     # destination blocks.
     assert 2 * num_mappings <= num_blocks
@@ -66,7 +69,7 @@ def test_copy_blocks(
     key_caches, value_caches = kv_cache_factory(num_blocks, block_size,
                                                 num_layers, num_heads,
                                                 head_size, kv_cache_dtype,
-                                                dtype, seed, gpu_id)
+                                                dtype, seed, device)
     # Clone the KV caches.
     cloned_key_caches = [key_cache.clone() for key_cache in key_caches]
@@ -98,7 +101,7 @@ def test_copy_blocks(
 @pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("seed", SEEDS)
-@pytest.mark.parametrize("device", DEVICES)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
 @torch.inference_mode()
 def test_reshape_and_cache(
     kv_cache_factory,
@@ -109,29 +112,25 @@ def test_reshape_and_cache(
     num_blocks: int,
     dtype: torch.dtype,
     seed: int,
-    device: int,
+    device: str,
 ) -> None:
     random.seed(seed)
     torch.random.manual_seed(seed)
-    torch.cuda.manual_seed(seed)
-    gpu_id = f"cuda:{device}"
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+    torch.set_default_device(device)
     # Create a random slot mapping.
     num_slots = block_size * num_blocks
     slot_mapping = random.sample(range(num_slots), num_tokens)
-    slot_mapping = torch.tensor(slot_mapping, dtype=torch.long, device=gpu_id)
-    qkv = torch.randn(num_tokens,
-                      3,
-                      num_heads,
-                      head_size,
-                      dtype=dtype,
-                      device=gpu_id)
+    slot_mapping = torch.tensor(slot_mapping, dtype=torch.long)
+    qkv = torch.randn(num_tokens, 3, num_heads, head_size, dtype=dtype)
     _, key, value = qkv.unbind(dim=1)
     # Create the KV caches.
     key_caches, value_caches = kv_cache_factory(num_blocks, block_size, 1,
                                                 num_heads, head_size, dtype,
-                                                None, seed, gpu_id)
+                                                None, seed, device)
     key_cache, value_cache = key_caches[0], value_caches[0]
     # Clone the KV caches.
@@ -166,7 +165,7 @@ def test_reshape_and_cache(
 @pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("seed", SEEDS)
-@pytest.mark.parametrize("device", DEVICES)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
 @torch.inference_mode()
 def test_swap_blocks(
     kv_cache_factory,
@@ -182,7 +181,8 @@ def test_swap_blocks(
 ) -> None:
     random.seed(seed)
     torch.random.manual_seed(seed)
-    torch.cuda.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
     src_device = f"{direction[0]}:{device}" if direction[
         0] == "cuda" else direction[0]
     dst_device = f"{direction[1]}:{device}" if direction[
...
@@ -8,7 +8,9 @@ NUM_TOKENS = [7, 83, 4096]  # Arbitrary values for testing
 HIDDEN_SIZES = [768, 5120, 8192]  # Arbitrary values for testing
 ADD_RESIDUAL = [False, True]
 SEEDS = [0]
-DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)]
+CUDA_DEVICES = [
+    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
+]
 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@@ -16,7 +18,7 @@ DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)]
 @pytest.mark.parametrize("add_residual", ADD_RESIDUAL)
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("seed", SEEDS)
-@pytest.mark.parametrize("device", DEVICES)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
 @torch.inference_mode()
 def test_rms_norm(
     num_tokens: int,
@@ -24,15 +26,16 @@ def test_rms_norm(
     add_residual: bool,
     dtype: torch.dtype,
     seed: int,
-    device: int,
+    device: str,
 ) -> None:
     torch.random.manual_seed(seed)
-    torch.cuda.manual_seed(seed)
-    gpu_id = f"cuda:{device}"
-    layer = RMSNorm(hidden_size).to(dtype=dtype, device=gpu_id)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+    torch.set_default_device(device)
+    layer = RMSNorm(hidden_size).to(dtype=dtype)
     layer.weight.data.normal_(mean=1.0, std=0.1)
     scale = 1 / (2 * hidden_size)
-    x = torch.randn(num_tokens, hidden_size, dtype=dtype, device=gpu_id)
+    x = torch.randn(num_tokens, hidden_size, dtype=dtype)
     x *= scale
     residual = torch.randn_like(x) * scale if add_residual else None
...
@@ -13,7 +13,9 @@ NUM_HEADS = [7, 17]  # Arbitrary values for testing
 BATCH_SIZES = [1, 5]  # Arbitrary values for testing
 SEQ_LENS = [11, 8192]  # Arbitrary values for testing
 SEEDS = [0]
-DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)]
+CUDA_DEVICES = [
+    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
+]
 @pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE)
@@ -24,7 +26,7 @@ DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)]
 @pytest.mark.parametrize("rotary_dim", ROTARY_DIMS)
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("seed", SEEDS)
-@pytest.mark.parametrize("device", DEVICES)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
 @torch.inference_mode()
 def test_rotary_embedding(
     is_neox_style: bool,
@@ -35,28 +37,26 @@ def test_rotary_embedding(
     rotary_dim: Optional[int],
     dtype: torch.dtype,
     seed: int,
-    device: int,
+    device: str,
     max_position: int = 8192,
     base: int = 10000,
 ) -> None:
     if rotary_dim is None:
         rotary_dim = head_size
     torch.random.manual_seed(seed)
-    torch.cuda.manual_seed(seed)
-    gpu_id = f"cuda:{device}"
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+    torch.set_default_device(device)
     if rotary_dim is None:
         rotary_dim = head_size
     rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style)
-    rope = rope.to(dtype=dtype, device=gpu_id)
-    positions = torch.randint(0,
-                              max_position, (batch_size, seq_len),
-                              device=gpu_id)
+    rope = rope.to(dtype=dtype)
+    positions = torch.randint(0, max_position, (batch_size, seq_len))
     query = torch.randn(batch_size,
                         seq_len,
                         num_heads * head_size,
-                        dtype=dtype,
-                        device=gpu_id)
+                        dtype=dtype)
     key = torch.randn_like(query)
     # NOTE(woosuk): The reference implementation should be executed first
...
@@ -11,19 +11,27 @@ from xformers.ops.fmha.attn_bias import BlockDiagonalCausalFromBottomRightMask
 NUM_HEADS = [12]
 HEAD_SIZES = [128]
 DTYPES = [torch.float16]
+CUDA_DEVICES = [
+    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
+]
 @pytest.mark.parametrize("num_heads", NUM_HEADS)
 @pytest.mark.parametrize("head_size", HEAD_SIZES)
 @pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
 @torch.inference_mode()
 def test_contexted_kv_attention(
     num_heads: int,
     head_size: int,
     dtype: torch.dtype,
+    device: str,
 ) -> None:
     random.seed(0)
     torch.manual_seed(0)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(0)
+    torch.set_default_device(device)
     MAX_SEQ_LEN = 1024
     MAX_CTX_LEN = 1024
     BS = 10
@@ -35,24 +43,11 @@ def test_contexted_kv_attention(
     seq_lens = [a + b for a, b in zip(subquery_lens, ctx_lens)]
     num_tokens = sum(subquery_lens)
-    query = torch.empty(num_tokens,
-                        num_heads,
-                        head_size,
-                        dtype=dtype,
-                        device='cuda')
+    query = torch.empty(num_tokens, num_heads, head_size, dtype=dtype)
     query.uniform_(-1e-3, 1e-3)
-    output = torch.empty(num_tokens,
-                         num_heads,
-                         head_size,
-                         dtype=dtype,
-                         device='cuda')
-    kv = torch.empty(sum(seq_lens),
-                     2,
-                     num_heads,
-                     head_size,
-                     dtype=dtype,
-                     device='cuda')
+    output = torch.empty(num_tokens, num_heads, head_size, dtype=dtype)
+    kv = torch.empty(sum(seq_lens), 2, num_heads, head_size, dtype=dtype)
     kv.uniform_(-1e-3, 1e-3)
     key, value = kv.unbind(dim=1)
@@ -60,39 +55,27 @@ def test_contexted_kv_attention(
                           block_size,
                           num_heads,
                           head_size,
-                          dtype=dtype,
-                          device='cuda')
+                          dtype=dtype)
     v_cache = torch.zeros(cache_size,
                           block_size,
                           num_heads,
                           head_size,
-                          dtype=dtype,
-                          device='cuda')
-    k = torch.zeros(sum(subquery_lens),
-                    num_heads,
-                    head_size,
-                    dtype=dtype,
-                    device='cuda')
-    v = torch.zeros(sum(subquery_lens),
-                    num_heads,
-                    head_size,
-                    dtype=dtype,
-                    device='cuda')
-    values = torch.arange(0, cache_size, dtype=torch.long, device='cuda')
+                          dtype=dtype)
+    k = torch.zeros(sum(subquery_lens), num_heads, head_size, dtype=dtype)
+    v = torch.zeros(sum(subquery_lens), num_heads, head_size, dtype=dtype)
+    values = torch.arange(0, cache_size, dtype=torch.long)
     values = values[torch.randperm(cache_size)]
     block_table = values[:BS * max_block_per_request].view(
         BS, max_block_per_request)
-    b_seq_len = torch.tensor(seq_lens, dtype=torch.long, device='cuda')
-    b_ctx_len = torch.tensor(ctx_lens, dtype=torch.long, device='cuda')
+    b_seq_len = torch.tensor(seq_lens, dtype=torch.long)
+    b_ctx_len = torch.tensor(ctx_lens, dtype=torch.long)
     b_start_loc = torch.cumsum(torch.tensor([0] + subquery_lens[:-1],
-                                            dtype=torch.long,
-                                            device='cuda'),
+                                            dtype=torch.long),
                                dim=0)
     max_input_len = MAX_SEQ_LEN
     # copy kv to cache
     b_seq_start_loc = torch.cumsum(torch.tensor([0] + seq_lens[:-1],
-                                                dtype=torch.long,
-                                                device='cuda'),
+                                                dtype=torch.long),
                                    dim=0)
     for i in range(BS):
         for j in range(subquery_lens[i]):
...
@@ -126,8 +126,8 @@ def llama_2_7b_engine_extra_embeddings() -> nn.Module:
     cleanup()
     get_model_old = get_model
-    def get_model_patched(model_config, lora_config=None):
-        return get_model_old(model_config,
+    def get_model_patched(model_config, device_config, lora_config=None):
+        return get_model_old(model_config, device_config,
                              LoRAConfig(max_loras=4, max_lora_rank=8))
     with patch("vllm.worker.model_runner.get_model", get_model_patched):
...
@@ -34,6 +34,9 @@ TOLERANCES = {
     torch.float32: (5e-3, 5e-3),
     torch.bfloat16: (3e-2, 2e-2),
 }
+CUDA_DEVICES = [
+    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
+]
 def get_random_id_to_index(num_loras: int,
@@ -151,14 +154,10 @@ def create_random_inputs(
     for _ in range(num_inputs):
         if input_type == torch.int:
             inputs.append(
-                torch.randint(low=int(low),
-                              high=int(high),
-                              size=input_size,
-                              device="cuda"))
+                torch.randint(low=int(low), high=int(high), size=input_size))
         else:
             inputs.append(
-                torch.rand(size=input_size, dtype=input_type, device="cuda") *
-                high + low)
+                torch.rand(size=input_size, dtype=input_type) * high + low)
         lora_id = random.choice(active_lora_ids)
         index_mapping += [lora_id] * input_size[0]
@@ -169,8 +168,10 @@ def create_random_inputs(
 @torch.inference_mode()
 @pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
-def test_embeddings(dist_init, num_loras) -> None:
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+def test_embeddings(dist_init, num_loras, device) -> None:
+    torch.set_default_device(device)
     max_loras = 8
     lora_config = LoRAConfig(max_loras=max_loras,
                              max_lora_rank=8,
@@ -259,8 +260,10 @@ def test_embeddings(dist_init, num_loras) -> None:
 @torch.inference_mode()
 # @pytest.mark.skip(reason="Fails when loras are in any slot other than the first.")
 @pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
-def test_embeddings_with_new_embeddings(dist_init, num_loras) -> None:
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+def test_embeddings_with_new_embeddings(dist_init, num_loras, device) -> None:
+    torch.set_default_device(device)
     max_loras = 8
     lora_config = LoRAConfig(max_loras=max_loras,
                              max_lora_rank=8,
@@ -305,8 +308,7 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras) -> None:
     # Add empty embeddings_tensors for unoccupied lora slots.
     for _ in range(max_loras - len(embeddings_tensors)):
-        embeddings_tensors.append(
-            torch.zeros(embeddings_tensors[0].shape, device="cuda"))
+        embeddings_tensors.append(torch.zeros(embeddings_tensors[0].shape))
     inputs, index_mapping, prompt_mapping = create_random_inputs(
         active_lora_ids=list(lora_dict.keys()),
@@ -388,8 +390,10 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras) -> None:
 @torch.inference_mode()
 @pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
-def test_lm_head_sampler(dist_init, num_loras) -> None:
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+def test_lm_head_sampler(dist_init, num_loras, device) -> None:
+    torch.set_default_device(device)
     max_loras = 8
     lora_config = LoRAConfig(max_loras=max_loras,
                              max_lora_rank=8,
@@ -432,7 +436,7 @@ def test_lm_head_sampler(dist_init, num_loras) -> None:
     )
     lora_mapping = LoRAMapping(index_mapping, prompt_mapping)
-    input_ = torch.rand(20, 1024, device="cuda")
+    input_ = torch.rand(20, 1024)
     mapping_info = convert_mapping(
         lora_mapping,
         id_to_index,
@@ -500,8 +504,10 @@ def test_lm_head_sampler(dist_init, num_loras) -> None:
 @torch.inference_mode()
 @pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
 @pytest.mark.parametrize("orientation", ["row", "column"])
-def test_linear_parallel(dist_init, num_loras, orientation) -> None:
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+def test_linear_parallel(dist_init, num_loras, orientation, device) -> None:
+    torch.set_default_device(device)
     max_loras = 8
     lora_config = LoRAConfig(max_loras=max_loras,
                              max_lora_rank=8,
@@ -597,8 +603,10 @@ def test_linear_parallel(dist_init, num_loras, orientation) -> None:
 @torch.inference_mode()
 @pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
 @pytest.mark.parametrize("repeats", [2, 3])
-def test_column_parallel_packed(dist_init, num_loras, repeats) -> None:
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+def test_column_parallel_packed(dist_init, num_loras, repeats, device) -> None:
+    torch.set_default_device(device)
     max_loras = 8
     lora_config = LoRAConfig(max_loras=max_loras,
                              max_lora_rank=8,
...
@@ -5,7 +5,8 @@ from unittest.mock import patch
 from vllm.lora.models import LoRAMapping
 from vllm.lora.request import LoRARequest
-from vllm.config import ModelConfig, ParallelConfig, SchedulerConfig, LoRAConfig
+from vllm.config import (ModelConfig, ParallelConfig, SchedulerConfig,
+                         DeviceConfig, LoRAConfig)
 from vllm.worker.worker import Worker
@@ -25,6 +26,7 @@ def test_worker_apply_lora(sql_lora_files):
         ),
         parallel_config=ParallelConfig(1, 1, False),
         scheduler_config=SchedulerConfig(32, 32, 32, 256),
+        device_config=DeviceConfig("cuda"),
         local_rank=0,
         rank=0,
         lora_config=LoRAConfig(max_lora_rank=8, max_cpu_loras=32,
...
@@ -9,6 +9,10 @@ from vllm.model_executor.utils import set_random_seed
 from vllm.model_executor.layers.rejection_sampler import RejectionSampler
+CUDA_DEVICES = [
+    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
+]
 def mock_causal_accepted_tensor(
         k: int, last_accepted_indices: torch.Tensor) -> torch.Tensor:
@@ -39,11 +43,14 @@ def mock_causal_accepted_tensor(
 @pytest.mark.parametrize(
     "which_tokens_accepted",
     ["all_tokens_accepted", "no_tokens_accepted", "some_tokens_accepted"])
+@pytest.mark.parametrize("device", CUDA_DEVICES)
 @torch.inference_mode()
-def test_correct_output_format(which_tokens_accepted: str, seed: int):
+def test_correct_output_format(which_tokens_accepted: str, seed: int,
+                               device: str):
     """Verify the output has correct format given predetermined accepted matrix.
     """
     set_random_seed(seed)
+    torch.set_default_device(device)
     batch_size = 10
     k = 5
@@ -66,18 +73,15 @@ def test_correct_output_format(which_tokens_accepted: str, seed: int):
     recovered_token_ids = torch.randint(low=0,
                                         high=vocab_size,
                                         size=(batch_size, k),
-                                        dtype=torch.int64,
-                                        device="cuda")
+                                        dtype=torch.int64)
     draft_token_ids = torch.randint(low=0,
                                     high=vocab_size,
                                     size=(batch_size, k),
-                                    dtype=torch.int64,
-                                    device="cuda")
+                                    dtype=torch.int64)
     bonus_token_ids = torch.randint(low=0,
                                     high=vocab_size,
                                     size=(batch_size, 1),
-                                    dtype=torch.int64,
-                                    device="cuda")
+                                    dtype=torch.int64)
     rejection_sampler = RejectionSampler()
     rejection_sampler.init_gpu_tensors(rank=0)
@@ -120,31 +124,24 @@ def test_correct_output_format(which_tokens_accepted: str, seed: int):
 @pytest.mark.parametrize("k", list(range(1, 6)))
 @pytest.mark.parametrize("vocab_size", [30_000, 50_000])
 @pytest.mark.parametrize("batch_size", list(range(1, 32)))
+@pytest.mark.parametrize("device", CUDA_DEVICES)
 @torch.inference_mode()
-def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int):
+def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int,
+                                    device: str):
+    torch.set_default_device(device)
     rejection_sampler = RejectionSampler()
     rejection_sampler.init_gpu_tensors(rank=0)
-    draft_probs = torch.rand(batch_size,
-                             k,
-                             vocab_size,
-                             dtype=torch.float32,
-                             device="cuda")
-    target_probs = torch.rand(batch_size,
-                              k,
-                              vocab_size,
-                              dtype=torch.float32,
-                              device="cuda")
+    draft_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32)
+    target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32)
     bonus_token_ids = torch.randint(low=0,
                                     high=vocab_size,
                                     size=(batch_size, 1),
-                                    dtype=torch.int64,
-                                    device="cuda")
+                                    dtype=torch.int64)
     draft_token_ids = torch.randint(low=0,
                                     high=vocab_size,
                                     size=(batch_size, k),
-                                    dtype=torch.int64,
-                                    device="cuda")
+                                    dtype=torch.int64)
     rejection_sampler(target_probs, bonus_token_ids, draft_probs,
                       draft_token_ids)
@@ -153,36 +150,28 @@ def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int):
 @pytest.mark.parametrize("above_or_below_vocab_range", ["above", "below"])
 @pytest.mark.parametrize("which_token_ids",
                          ["bonus_token_ids", "draft_token_ids"])
+@pytest.mark.parametrize("device", CUDA_DEVICES)
 @torch.inference_mode()
 def test_raises_when_vocab_oob(above_or_below_vocab_range: str,
-                               which_token_ids: str):
+                               which_token_ids: str, device: str):
     k = 3
     batch_size = 5
     vocab_size = 30_000
+    torch.set_default_device(device)
     rejection_sampler = RejectionSampler(strict_mode=True)
     rejection_sampler.init_gpu_tensors(rank=0)
-    draft_probs = torch.rand(batch_size,
-                             k,
-                             vocab_size,
-                             dtype=torch.float32,
-                             device="cuda")
-    target_probs = torch.rand(batch_size,
-                              k,
-                              vocab_size,
-                              dtype=torch.float32,
-                              device="cuda")
+    draft_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32)
+    target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32)
     bonus_token_ids = torch.randint(low=0,
                                     high=vocab_size,
                                     size=(batch_size, 1),
-                                    dtype=torch.int64,
-                                    device="cuda")
+                                    dtype=torch.int64)
     draft_token_ids = torch.randint(low=0,
                                     high=vocab_size,
                                     size=(batch_size, k),
-                                    dtype=torch.int64,
-                                    device="cuda")
+                                    dtype=torch.int64)
     oob_token_ids = None
     if which_token_ids == "bonus_token_ids":
@@ -237,6 +226,7 @@ def test_rejection_sampling_approximates_target_distribution(
     probabilities are exactly equal. Rejection sampling should
     still work without any NaNs or exceptions.
     """
+    torch.set_default_device("cpu")
     set_random_seed(seed)
     helper = _CorrectnessTestHelper(
...
@@ -31,24 +31,26 @@ def _prepare_test(
     batch_size: int
 ) -> Tuple[torch.Tensor, torch.Tensor, MockLogitsSampler, ModelRunner]:
     vocab_size = 32000
-    input_tensor = torch.rand((batch_size, 1024),
-                              device="cuda",
-                              dtype=torch.float16)
+    input_tensor = torch.rand((batch_size, 1024), dtype=torch.float16)
     fake_logits = torch.full((batch_size, vocab_size),
                              1e-2,
-                             device=input_tensor.device,
                              dtype=input_tensor.dtype)
     sampler = MockLogitsSampler(32000, fake_logits)
-    model_runner = ModelRunner(None, None, None, None)
+    model_runner = ModelRunner(None, None, None, None, None)
     return input_tensor, fake_logits, sampler, model_runner
 RANDOM_SEEDS = list(range(128))
+CUDA_DEVICES = [
+    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
+]
 @pytest.mark.parametrize("seed", RANDOM_SEEDS)
-def test_sampler_all_greedy(seed: int):
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+def test_sampler_all_greedy(seed: int, device: str):
     set_random_seed(seed)
+    torch.set_default_device(device)
     batch_size = random.randint(1, 256)
     input_tensor, fake_logits, sampler, model_runner = _prepare_test(
         batch_size)
@@ -81,8 +83,10 @@ def test_sampler_all_greedy(seed: int):
 @pytest.mark.parametrize("seed", RANDOM_SEEDS)
-def test_sampler_all_random(seed: int):
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+def test_sampler_all_random(seed: int, device: str):
     set_random_seed(seed)
+    torch.set_default_device(device)
     batch_size = random.randint(1, 256)
     input_tensor, fake_logits, sampler, model_runner = _prepare_test(
         batch_size)
@@ -120,8 +124,10 @@ def test_sampler_all_random(seed: int):
 @pytest.mark.parametrize("seed", RANDOM_SEEDS)
-def test_sampler_all_beam(seed: int):
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+def test_sampler_all_beam(seed: int, device: str):
     set_random_seed(seed)
+    torch.set_default_device(device)
     batch_size = random.randint(1, 256)
     input_tensor, _, sampler, model_runner = _prepare_test(batch_size)
@@ -156,8 +162,10 @@ def test_sampler_all_beam(seed: int):
 @pytest.mark.parametrize("seed", RANDOM_SEEDS)
-def test_sampler_mixed(seed: int):
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+def test_sampler_mixed(seed: int, device: str):
     set_random_seed(seed)
+    torch.set_default_device(device)
     batch_size = random.randint(1, 256)
     input_tensor, fake_logits, sampler, model_runner = _prepare_test(
         batch_size)
@@ -212,8 +220,10 @@ def test_sampler_mixed(seed: int):
 @pytest.mark.parametrize("seed", RANDOM_SEEDS)
-def test_sampler_logits_processors(seed: int):
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+def test_sampler_logits_processors(seed: int, device: str):
     set_random_seed(seed)
+    torch.set_default_device(device)
     batch_size = random.randint(1, 256)
     input_tensor, _, sampler, model_runner = _prepare_test(batch_size)
@@ -252,14 +262,15 @@ def test_sampler_logits_processors(seed: int):
 @pytest.mark.parametrize("seed", RANDOM_SEEDS)
-def test_sampler_top_k_top_p(seed: int):
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+def test_sampler_top_k_top_p(seed: int, device: str):
     set_random_seed(seed)
     batch_size = random.randint(1, 256)
     top_k = random.randint(100, 500)
     top_p = random.random() * 0.1
     vocab_size = 32000
     input_tensor = torch.rand((batch_size, 1024),
-                              device="cuda",
+                              device=device,
                               dtype=torch.float16)
     fake_logits = torch.normal(0,
                                5,
@@ -267,7 +278,7 @@ def test_sampler_top_k_top_p(seed: int):
                                device=input_tensor.device,
                                dtype=input_tensor.dtype)
     sampler = MockLogitsSampler(32000, fake_logits)
-    model_runner = ModelRunner(None, None, None, None)
+    model_runner = ModelRunner(None, None, None, None, None)
     generation_model = GenerationMixin()
     generation_config = GenerationConfig(top_k=top_k,
...
@@ -84,7 +84,7 @@ def create_worker(cls: type,
     )
     (model_config, cache_config, parallel_config, scheduler_config,
-     _) = engine_args.create_engine_configs()
+     device_config, _) = engine_args.create_engine_configs()
     distributed_init_method = get_distributed_init_method(
         get_ip(), get_open_port())
@@ -93,6 +93,7 @@ def create_worker(cls: type,
         model_config=model_config,
         parallel_config=parallel_config,
         scheduler_config=scheduler_config,
+        device_config=device_config,
         local_rank=0,
         rank=0,
         distributed_init_method=distributed_init_method,
...
@@ -6,7 +6,7 @@ from vllm.worker.model_runner import ModelRunner
 def test_prepare_prompt():
-    model_runner = ModelRunner(None, None, None, None)
+    model_runner = ModelRunner(None, None, None, None, None)
     model_runner.set_block_size(16)
     batch_size = random.randint(1, 256)
...
...@@ -444,6 +444,12 @@ class SchedulerConfig: ...@@ -444,6 +444,12 @@ class SchedulerConfig:
f"({self.max_num_seqs}).") f"({self.max_num_seqs}).")
class DeviceConfig:
def __init__(self, device: str = "cuda") -> None:
self.device = torch.device(device)
@dataclass @dataclass
class LoRAConfig: class LoRAConfig:
max_lora_rank: int max_lora_rank: int
......
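
The new DeviceConfig is deliberately thin: it wraps the device string in a torch.device, so downstream code can ask for device_config.device instead of hardcoding "cuda". A minimal usage sketch:

    import torch
    from vllm.config import DeviceConfig

    device_config = DeviceConfig("cuda" if torch.cuda.is_available() else "cpu")
    # Explicit placement via the config, no literal "cuda" in the call site.
    x = torch.empty(4, 8, dtype=torch.float16).to(device_config.device)
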
...@@ -3,8 +3,8 @@ import dataclasses ...@@ -3,8 +3,8 @@ import dataclasses
from dataclasses import dataclass from dataclasses import dataclass
from typing import Optional, Tuple from typing import Optional, Tuple
from vllm.config import (CacheConfig, ModelConfig, ParallelConfig, from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
SchedulerConfig, LoRAConfig) ParallelConfig, SchedulerConfig, LoRAConfig)
@dataclass @dataclass
...@@ -43,6 +43,7 @@ class EngineArgs: ...@@ -43,6 +43,7 @@ class EngineArgs:
lora_extra_vocab_size: int = 256 lora_extra_vocab_size: int = 256
lora_dtype = 'auto' lora_dtype = 'auto'
max_cpu_loras: Optional[int] = None max_cpu_loras: Optional[int] = None
device: str = 'cuda'
def __post_init__(self): def __post_init__(self):
if self.tokenizer is None: if self.tokenizer is None:
...@@ -127,13 +128,13 @@ class EngineArgs: ...@@ -127,13 +128,13 @@ class EngineArgs:
'--kv-cache-dtype', '--kv-cache-dtype',
type=str, type=str,
choices=['auto', 'fp8_e5m2'], choices=['auto', 'fp8_e5m2'],
default='auto', default=EngineArgs.kv_cache_dtype,
help='Data type for kv cache storage. If "auto", will use model ' help='Data type for kv cache storage. If "auto", will use model '
'data type. Note FP8 is not supported when cuda version is ' 'data type. Note FP8 is not supported when cuda version is '
'lower than 11.8.') 'lower than 11.8.')
parser.add_argument('--max-model-len', parser.add_argument('--max-model-len',
type=int, type=int,
default=None, default=EngineArgs.max_model_len,
help='model context length. If unspecified, ' help='model context length. If unspecified, '
'will be automatically derived from the model.') 'will be automatically derived from the model.')
# Parallel arguments # Parallel arguments
...@@ -154,6 +155,7 @@ class EngineArgs: ...@@ -154,6 +155,7 @@ class EngineArgs:
parser.add_argument( parser.add_argument(
'--max-parallel-loading-workers', '--max-parallel-loading-workers',
type=int, type=int,
default=EngineArgs.max_parallel_loading_workers,
help='load model sequentially in multiple batches, ' help='load model sequentially in multiple batches, '
'to avoid RAM OOM when using tensor ' 'to avoid RAM OOM when using tensor '
'parallel and large models') 'parallel and large models')
...@@ -200,7 +202,7 @@ class EngineArgs: ...@@ -200,7 +202,7 @@ class EngineArgs:
'-q', '-q',
type=str, type=str,
choices=['awq', 'gptq', 'squeezellm', None], choices=['awq', 'gptq', 'squeezellm', None],
default=None, default=EngineArgs.quantization,
help='Method used to quantize the weights. If ' help='Method used to quantize the weights. If '
'None, we first check the `quantization_config` ' 'None, we first check the `quantization_config` '
'attribute in the model config file. If that is ' 'attribute in the model config file. If that is '
...@@ -255,6 +257,13 @@ class EngineArgs: ...@@ -255,6 +257,13 @@ class EngineArgs:
help=('Maximum number of LoRAs to store in CPU memory. ' help=('Maximum number of LoRAs to store in CPU memory. '
'Must be >= than max_num_seqs. ' 'Must be >= than max_num_seqs. '
'Defaults to max_num_seqs.')) 'Defaults to max_num_seqs.'))
parser.add_argument(
"--device",
type=str,
default=EngineArgs.device,
choices=["cuda"],
help=('Device type for vLLM execution. '
'Currently, only CUDA-compatible devices are supported.'))
return parser return parser
@classmethod @classmethod
...@@ -268,7 +277,8 @@ class EngineArgs: ...@@ -268,7 +277,8 @@ class EngineArgs:
def create_engine_configs( def create_engine_configs(
self, self,
) -> Tuple[ModelConfig, CacheConfig, ParallelConfig, SchedulerConfig, ) -> Tuple[ModelConfig, CacheConfig, ParallelConfig, SchedulerConfig,
Optional[LoRAConfig]]: DeviceConfig, Optional[LoRAConfig]]:
device_config = DeviceConfig(self.device)
model_config = ModelConfig(self.model, self.tokenizer, model_config = ModelConfig(self.model, self.tokenizer,
self.tokenizer_mode, self.trust_remote_code, self.tokenizer_mode, self.trust_remote_code,
self.download_dir, self.load_format, self.download_dir, self.load_format,
...@@ -296,7 +306,8 @@ class EngineArgs: ...@@ -296,7 +306,8 @@ class EngineArgs:
lora_dtype=self.lora_dtype, lora_dtype=self.lora_dtype,
max_cpu_loras=self.max_cpu_loras if self.max_cpu_loras max_cpu_loras=self.max_cpu_loras if self.max_cpu_loras
and self.max_cpu_loras > 0 else None) if self.enable_lora else None and self.max_cpu_loras > 0 else None) if self.enable_lora else None
return model_config, cache_config, parallel_config, scheduler_config, lora_config return (model_config, cache_config, parallel_config, scheduler_config,
device_config, lora_config)
@dataclass @dataclass
......
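
Besides the new --device flag, this hunk also switches several argparse defaults from literals to EngineArgs.<field>, so the CLI defaults can never drift from the dataclass. A standalone sketch of that pattern (the Args dataclass below is illustrative, not vLLM code):

    import argparse
    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class Args:
        device: str = "cuda"
        max_model_len: Optional[int] = None

    # Defaults come from the dataclass, mirroring default=EngineArgs.device
    # and default=EngineArgs.kv_cache_dtype in the diff.
    parser = argparse.ArgumentParser()
    parser.add_argument("--device", type=str, default=Args.device, choices=["cuda"])
    parser.add_argument("--max-model-len", type=int, default=Args.max_model_len)

    args = parser.parse_args([])
    print(args.device, args.max_model_len)  # cuda None
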
...@@ -6,8 +6,8 @@ from typing import (TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, ...@@ -6,8 +6,8 @@ from typing import (TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple,
Union) Union)
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.config import (CacheConfig, ModelConfig, ParallelConfig, from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
SchedulerConfig, LoRAConfig) ParallelConfig, SchedulerConfig, LoRAConfig)
from vllm.core.scheduler import Scheduler, SchedulerOutputs from vllm.core.scheduler import Scheduler, SchedulerOutputs
from vllm.engine.arg_utils import EngineArgs from vllm.engine.arg_utils import EngineArgs
from vllm.engine.metrics import StatLogger, Stats from vllm.engine.metrics import StatLogger, Stats
...@@ -53,6 +53,7 @@ class LLMEngine: ...@@ -53,6 +53,7 @@ class LLMEngine:
management. management.
parallel_config: The configuration related to distributed execution. parallel_config: The configuration related to distributed execution.
scheduler_config: The configuration related to the request scheduler. scheduler_config: The configuration related to the request scheduler.
device_config: The configuration related to the device.
placement_group: Ray placement group for distributed execution. placement_group: Ray placement group for distributed execution.
Required for distributed execution. Required for distributed execution.
log_stats: Whether to log statistics. log_stats: Whether to log statistics.
...@@ -64,6 +65,7 @@ class LLMEngine: ...@@ -64,6 +65,7 @@ class LLMEngine:
cache_config: CacheConfig, cache_config: CacheConfig,
parallel_config: ParallelConfig, parallel_config: ParallelConfig,
scheduler_config: SchedulerConfig, scheduler_config: SchedulerConfig,
device_config: DeviceConfig,
lora_config: Optional[LoRAConfig], lora_config: Optional[LoRAConfig],
placement_group: Optional["PlacementGroup"], placement_group: Optional["PlacementGroup"],
log_stats: bool, log_stats: bool,
...@@ -85,6 +87,7 @@ class LLMEngine: ...@@ -85,6 +87,7 @@ class LLMEngine:
f"quantization={model_config.quantization}, " f"quantization={model_config.quantization}, "
f"enforce_eager={model_config.enforce_eager}, " f"enforce_eager={model_config.enforce_eager}, "
f"kv_cache_dtype={cache_config.cache_dtype}, " f"kv_cache_dtype={cache_config.cache_dtype}, "
f"device_config={device_config.device}, "
f"seed={model_config.seed})") f"seed={model_config.seed})")
# TODO(woosuk): Print more configs in debug mode. # TODO(woosuk): Print more configs in debug mode.
...@@ -93,6 +96,7 @@ class LLMEngine: ...@@ -93,6 +96,7 @@ class LLMEngine:
self.lora_config = lora_config self.lora_config = lora_config
self.parallel_config = parallel_config self.parallel_config = parallel_config
self.scheduler_config = scheduler_config self.scheduler_config = scheduler_config
self.device_config = device_config
self.log_stats = log_stats self.log_stats = log_stats
self._verify_args() self._verify_args()
...@@ -138,6 +142,7 @@ class LLMEngine: ...@@ -138,6 +142,7 @@ class LLMEngine:
self.model_config, self.model_config,
self.parallel_config, self.parallel_config,
self.scheduler_config, self.scheduler_config,
self.device_config,
local_rank=0, local_rank=0,
rank=0, rank=0,
distributed_init_method=distributed_init_method, distributed_init_method=distributed_init_method,
...@@ -233,6 +238,7 @@ class LLMEngine: ...@@ -233,6 +238,7 @@ class LLMEngine:
model_config = copy.deepcopy(self.model_config) model_config = copy.deepcopy(self.model_config)
parallel_config = copy.deepcopy(self.parallel_config) parallel_config = copy.deepcopy(self.parallel_config)
scheduler_config = copy.deepcopy(self.scheduler_config) scheduler_config = copy.deepcopy(self.scheduler_config)
device_config = copy.deepcopy(self.device_config)
for rank, (worker, (node_id, for rank, (worker, (node_id,
_)) in enumerate(zip(self.workers, _)) in enumerate(zip(self.workers,
...@@ -244,6 +250,7 @@ class LLMEngine: ...@@ -244,6 +250,7 @@ class LLMEngine:
model_config, model_config,
parallel_config, parallel_config,
scheduler_config, scheduler_config,
device_config,
local_rank, local_rank,
rank, rank,
distributed_init_method, distributed_init_method,
...@@ -257,6 +264,7 @@ class LLMEngine: ...@@ -257,6 +264,7 @@ class LLMEngine:
model_config, model_config,
parallel_config, parallel_config,
scheduler_config, scheduler_config,
device_config,
driver_local_rank, driver_local_rank,
driver_rank, driver_rank,
distributed_init_method, distributed_init_method,
......
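
On the engine side the change is pure plumbing: device_config is stored on the engine, included in the startup log line, deep-copied per Ray worker, and threaded through the worker constructors alongside the other configs. A much-reduced sketch of that propagation (TinyEngine is a stand-in; the real Worker takes many more arguments):

    import copy
    from vllm.config import DeviceConfig

    class TinyEngine:
        """Keeps only the device_config plumbing from LLMEngine."""

        def __init__(self, device_config: DeviceConfig, num_workers: int = 2) -> None:
            self.device_config = device_config
            # Each worker gets its own deep copy before being shipped out,
            # mirroring the copy.deepcopy(...) calls in the diff.
            self.worker_device_configs = [
                copy.deepcopy(device_config) for _ in range(num_workers)
            ]

    engine = TinyEngine(DeviceConfig("cuda"))
    print(engine.worker_device_configs[0].device)  # cuda
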
...@@ -89,9 +89,7 @@ class ScaledActivation(nn.Module): ...@@ -89,9 +89,7 @@ class ScaledActivation(nn.Module):
if params_dtype is None: if params_dtype is None:
params_dtype = torch.get_default_dtype() params_dtype = torch.get_default_dtype()
self.scales = nn.Parameter( self.scales = nn.Parameter(
torch.empty(intermediate_size_per_partition, torch.empty(intermediate_size_per_partition, dtype=params_dtype))
dtype=params_dtype,
device="cuda"))
set_weight_attrs(self.scales, {"weight_loader": self.weight_loader}) set_weight_attrs(self.scales, {"weight_loader": self.weight_loader})
def forward(self, x: torch.Tensor) -> torch.Tensor: def forward(self, x: torch.Tensor) -> torch.Tensor:
......
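
The ScaledActivation hunk is the core pattern of the whole PR: parameters are allocated without a device= argument and inherit whatever default device the loading code has selected, so the layer itself stays device-agnostic. A minimal sketch of that idea (the class below is a simplified stand-in, and using torch.set_default_device at load time is an assumption about the caller, not something shown in this hunk):

    import torch
    import torch.nn as nn

    class ScaledActivationSketch(nn.Module):
        def __init__(self, intermediate_size: int, params_dtype: torch.dtype) -> None:
            super().__init__()
            # No device= here: allocation follows the current default device.
            self.scales = nn.Parameter(
                torch.empty(intermediate_size, dtype=params_dtype))

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            return x / self.scales

    # The caller picks the placement once, instead of every layer hardcoding "cuda".
    torch.set_default_device("cuda" if torch.cuda.is_available() else "cpu")
    layer = ScaledActivationSketch(16, torch.float16)
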