"tests/vscode:/vscode.git/clone" did not exist on "0d7c3cb51dcd0b90c203b7d36f15442a45b7e4b5"
Commit 38d80967 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.10.2rc2' into v0.10.2rc2-ori

parents 33650733 880c741b
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm.model_executor.layers.quantization.neuron_quant import (
NeuronQuantConfig)
def test_get_supported_act_dtypes():
neuron_quant_config = NeuronQuantConfig()
supported_act_dtypes = neuron_quant_config.get_supported_act_dtypes()
target_list = ["any_dtype1", "any_dtype2"]
for dtype in target_list:
assert dtype in supported_act_dtypes
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Optional
import pytest
import torch
import torch.nn.functional as F
from vllm.utils import cdiv
class BlockDiagonalCausalFromBottomRightMask:
@staticmethod
def _from_seqlens(query_lens, seq_lens, block_size=None):
from torch import logical_and, logical_or
contexted = block_size is None
context_lens = torch.tensor(seq_lens) - torch.tensor(query_lens)
n_queries = sum(query_lens)
num_seqs = len(query_lens)
if contexted:
key_lens_blockaligned = seq_lens
else:
n_blocks_per_seq = (context_lens + block_size - 1) // block_size
offset_per_seq = n_blocks_per_seq * block_size
key_lens_blockaligned = offset_per_seq[:num_seqs].tolist()
n_keys = sum(key_lens_blockaligned)
a = (torch.arange(n_queries).reshape(n_queries,
1).expand(n_queries, n_keys))
b = torch.arange(n_keys).reshape(1, n_keys).expand(n_queries, n_keys)
q_cumsum = torch.tensor([0] + query_lens).cumsum(dim=0)
k_cumsum = torch.tensor([0] + key_lens_blockaligned).cumsum(dim=0)
prior_mask = torch.zeros(n_queries, n_keys)
new_masks: list[torch.Tensor] = []
for seq_id in range(num_seqs):
ri = q_cumsum[seq_id]
ci = k_cumsum[seq_id]
nr = query_lens[seq_id]
if contexted:
nc = seq_lens[seq_id]
a_offset = ci + nc - ri - nr
new_mask = (a + a_offset) >= b
else:
nc = context_lens[seq_id]
a_offset = ci + nc - 1
new_mask = a_offset >= b
left_mask = b >= ci
top_mask = a >= ri
bottom_mask = a < (ri + nr)
new_mask = logical_and(
logical_and(logical_and(new_mask, left_mask), top_mask),
bottom_mask,
)
prior_mask = logical_or(prior_mask, new_mask)
new_masks = new_masks + [new_mask]
return prior_mask
@staticmethod
def from_seqlens(query_lens, seq_lens, block_size=None):
contexted = block_size is None
if contexted:
prior_mask = BlockDiagonalCausalFromBottomRightMask._from_seqlens(
query_lens, seq_lens)
active_mask = None
else:
prior_mask = BlockDiagonalCausalFromBottomRightMask._from_seqlens(
query_lens, seq_lens, block_size)
active_mask = BlockDiagonalCausalFromBottomRightMask._from_seqlens(
query_lens, query_lens)
return prior_mask, active_mask
def ref_softmax(x: torch.Tensor,
dim: int,
mixed_precision=False,
return_max_reduce=False):
max_value = torch.amax(x, dim=dim, keepdims=True)
exp = torch.exp(x - max_value)
if mixed_precision:
sum_value = torch.sum(exp.astype(torch.float32),
dim=dim,
keepdims=True).astype(x.dtype)
else:
sum_value = torch.sum(exp, dim=dim, keepdims=True)
if return_max_reduce:
return exp / sum_value, max_value, torch.reciprocal(sum_value)
return exp / sum_value
def ref_masked_attention(
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
scale: float,
attn_mask: Optional[torch.Tensor] = None,
return_max_reduce: Optional[bool] = False,
) -> torch.Tensor:
scaled_qk = scale * torch.einsum("qhd,khd->hqk", query, key).float()
if attn_mask is not None:
masked_score = scaled_qk + attn_mask.float()
if return_max_reduce:
norm_score, cached_max, cached_sum_reciprocal = ref_softmax(
masked_score, dim=-1, return_max_reduce=True)
else:
norm_score = ref_softmax(masked_score, dim=-1)
out = torch.einsum("hqk,khd->qhd", norm_score.to(value.dtype), value)
if return_max_reduce:
return (
out,
cached_max,
cached_sum_reciprocal,
norm_score,
masked_score,
scaled_qk,
)
else:
return (out, )
def ref_context_attention(
query,
key,
value,
query_lens,
seq_lens,
head_size,
num_queries_per_kv,
return_max_reduce=False,
):
scale = float(1.0 / (head_size**0.5))
if num_queries_per_kv > 1:
# Handle MQA and GQA
key = torch.repeat_interleave(key, num_queries_per_kv, dim=1)
value = torch.repeat_interleave(value, num_queries_per_kv, dim=1)
attn_mask, _ = BlockDiagonalCausalFromBottomRightMask.from_seqlens(
query_lens, seq_lens)
# convert binary mask to -inf values
attn_mask = torch.logical_not(attn_mask)
attn_mask = attn_mask.float() * -30000
output, *debug_tensors = ref_masked_attention(
query,
key,
value,
scale,
attn_mask,
return_max_reduce=return_max_reduce,
)
output = output.unsqueeze(1)
if return_max_reduce:
cached_max, cached_sum_reciprocal, lse, masked_score, scaled_qk = (
debug_tensors)
return (
output,
cached_max,
cached_sum_reciprocal,
lse,
masked_score,
scaled_qk,
)
else:
return output
def sample_inputs(
prefill_batch_size,
decode_batch_size,
min_query_len,
max_query_len,
min_ctx_len,
max_ctx_len,
block_size,
num_heads,
num_kv_heads,
head_size,
dtype,
):
batch_size = prefill_batch_size + decode_batch_size
max_model_len = (max_query_len + max_ctx_len) * 4
max_block_per_request = max_model_len // block_size
cache_size = (batch_size * max_block_per_request) + 2
prefill_ctx_lens = torch.randint(min_ctx_len,
max_ctx_len + 1, (prefill_batch_size, ),
dtype=torch.long).tolist()
decode_ctx_lens = torch.randint(min_ctx_len,
max_ctx_len + 1, (decode_batch_size, ),
dtype=torch.long).tolist()
ctx_lens = prefill_ctx_lens + decode_ctx_lens
query_lens = torch.randint(
min_query_len,
max_query_len + 1,
(prefill_batch_size, ),
dtype=torch.long,
).tolist() + [1 for _ in range(decode_batch_size)]
seq_lens = [a + b for a, b in zip(query_lens, ctx_lens)]
num_tokens = sum(query_lens)
query = torch.empty(num_tokens, num_heads, head_size, dtype=dtype)
query.uniform_(-1, 1)
torch.empty(num_tokens, num_heads, head_size, dtype=dtype)
kv = torch.empty(sum(seq_lens), 2, num_kv_heads, head_size, dtype=dtype)
kv.uniform_(-1, 1)
key, value = kv.unbind(dim=1)
k_cache = torch.zeros(cache_size,
block_size,
num_kv_heads,
head_size,
dtype=dtype)
v_cache = torch.zeros(cache_size,
block_size,
num_kv_heads,
head_size,
dtype=dtype)
k = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype)
v = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype)
values = torch.arange(0, cache_size, dtype=torch.long)
values = values[torch.randperm(cache_size)]
block_table = values[:batch_size * max_block_per_request].view(
batch_size, max_block_per_request)
b_ctx_len = torch.tensor(ctx_lens, dtype=torch.long)
b_start_loc = torch.cumsum(torch.tensor([0] + query_lens[:-1],
dtype=torch.long),
dim=0)
# copy kv to cache
b_seq_start_loc = torch.cumsum(torch.tensor([0] + seq_lens[:-1],
dtype=torch.long),
dim=0)
for i in range(batch_size):
for j in range(query_lens[i]):
k[b_start_loc[i] + j].copy_(key[b_seq_start_loc[i] + b_ctx_len[i] +
j])
v[b_start_loc[i] + j].copy_(value[b_seq_start_loc[i] +
b_ctx_len[i] + j])
cur_ctx = 0
block_id = 0
while cur_ctx < b_ctx_len[i]:
start_loc = b_seq_start_loc[i] + cur_ctx
if cur_ctx + block_size > b_ctx_len[i]:
end_loc = b_seq_start_loc[i] + b_ctx_len[i]
else:
end_loc = start_loc + block_size
start_slot = block_table[i, block_id] * block_size
end_slot = start_slot + end_loc - start_loc
k_cache.view(-1, num_kv_heads,
head_size)[start_slot:end_slot].copy_(
key[start_loc:end_loc])
v_cache.view(-1, num_kv_heads,
head_size)[start_slot:end_slot].copy_(
value[start_loc:end_loc])
cur_ctx += block_size
block_id += 1
kv_cache = torch.stack([k_cache, v_cache])
return (
query,
k,
v,
kv_cache,
block_table,
key,
value,
query_lens,
seq_lens,
)
def get_active_block_tables(block_tables, query_lens, seq_lens, block_size,
num_blocks):
context_lens = seq_lens - query_lens
blocks_per_seq = (context_lens + block_size - 1) // block_size
num_seqs = len(seq_lens)
active_blocks: list[int] = []
for seq_id in range(num_seqs):
active_blocks = (
active_blocks +
block_tables[seq_id, :blocks_per_seq[seq_id]].tolist())
return F.pad(
torch.tensor(active_blocks, dtype=torch.int32),
(0, num_blocks - len(active_blocks)),
"constant",
0,
)
@pytest.mark.parametrize(
"prefill_batch_size,decode_batch_size,block_size,large_tile_size,num_heads,num_queries_per_kv,head_size,mixed_precision",
[
# Test minimal configurations (small block size)
(1, 199, 1, 512, 4, 2, 8, False
), # minimal block size, small dimensions
(1, 199, 1, 512, 4, 2, 8, True), # same with mixed precision
# Test common/medium configurations
(4, 12, 32, 2048, 32, 8, 64, False), # common case, larger heads
(4, 12, 32, 2048, 16, 4, 32,
True), # medium size, mixed precision, grouped-query attention (GQA)
# Test large configurations
(4, 12, 256, 8192, 8, 1, 128, False), # large blocks, large head size
(4, 12, 256, 8192, 64, 8, 64, True), # large blocks, many heads
# Test asymmetric configurations
(2, 24, 64, 4096, 12, 4, 96, False), # varied batch sizes
(8, 8, 128, 2048, 24, 2, 48, True), # balanced batches
# Test edge cases
(1, 128, 16, 1024, 4, 2, 16, False), # large decode batch
(16, 4, 8, 1024, 4, 2, 128, True), # large prefill batch
(4, 12, 32, 2048, 16, 1, 32, True), # multi-head attention (MHA)
(4, 12, 32, 2048, 16, 16, 32, True), # multi-query attention (MQA)
])
@torch.inference_mode()
def test_contexted_kv_attention(
monkeypatch: pytest.MonkeyPatch,
prefill_batch_size: int,
decode_batch_size: int,
num_heads: int,
num_queries_per_kv: int,
head_size: int,
block_size: int,
large_tile_size,
mixed_precision: bool,
) -> None:
import torch_xla.core.xla_model as xm
from vllm.attention.ops.nki_flash_attn import (flash_attn_varlen_nkifunc,
reorder_context_mask)
assert large_tile_size % block_size == 0
device = xm.xla_device()
compiler_flags_str = " ".join([
"-O1",
"--retry_failed_compilation",
])
with monkeypatch.context() as m:
m.setenv("NEURON_CC_FLAGS", compiler_flags_str)
torch.manual_seed(0)
torch.set_printoptions(sci_mode=False)
torch.set_default_device("cpu")
dtype = torch.float32
min_ctx_len = 32
max_ctx_len = 1024
min_query_len = 16
max_query_len = 512
num_kv_heads = num_heads // num_queries_per_kv
(
query,
k_active,
v_active,
kv_cache,
block_table,
key,
value,
query_lens,
seq_lens,
) = sample_inputs(
prefill_batch_size=prefill_batch_size,
decode_batch_size=decode_batch_size,
min_query_len=min_query_len,
max_query_len=max_query_len,
min_ctx_len=min_ctx_len,
max_ctx_len=max_ctx_len,
block_size=block_size,
num_heads=num_heads,
num_kv_heads=num_kv_heads,
head_size=head_size,
dtype=dtype,
)
output_ref = ref_context_attention(
query,
key,
value,
query_lens,
seq_lens,
head_size,
num_queries_per_kv,
return_max_reduce=False,
)
# build neuron program
B_P_SIZE = 128
assert (large_tile_size >= B_P_SIZE
), f"Expect {large_tile_size=} to be larger than {B_P_SIZE=}"
def pad_to_multiple(a, b):
return cdiv(a, b) * b
def pad_to_next_power_of_2(a):
assert a > 0
return 2**int(a - 1).bit_length()
# calculate input shapes
max_num_queries = pad_to_next_power_of_2(sum(query_lens))
context_lens = torch.tensor(seq_lens) - torch.tensor(query_lens)
num_active_blocks = cdiv(context_lens, block_size).sum().item()
num_active_blocks = pad_to_multiple(num_active_blocks,
large_tile_size // block_size)
context_kv_len = num_active_blocks * block_size
assert (
context_kv_len %
large_tile_size == 0), f"invalid context_kv_len={context_kv_len}"
# pad QKV tensors
pad_dims = (
0,
0,
0,
0,
0,
max_num_queries - query.shape[0],
)
query = F.pad(query, pad_dims, "constant", 0)
k = F.pad(k_active, pad_dims, "constant", 0)
v = F.pad(v_active, pad_dims, "constant", 0)
# permute QKV tensors
# query: (1, n_heads, d, seq_q)
# key: (1, n_kv_heads, d, seq_k)
# value: (1, n_kv_heads, seq_v, d)
query = query.unsqueeze(0).permute(0, 2, 3, 1).contiguous()
k = k.unsqueeze(0).permute(0, 2, 3, 1).contiguous()
v = v.unsqueeze(0).permute(0, 2, 1, 3).contiguous()
kv_cache = kv_cache.permute(0, 1, 3, 2, 4).contiguous()
# transform block table
active_block_table = get_active_block_tables(
block_table.cpu(),
torch.tensor(query_lens).cpu(),
torch.tensor(seq_lens).cpu(),
block_size,
num_active_blocks,
)
# Build attention masks
prior_mask, active_mask = (
BlockDiagonalCausalFromBottomRightMask.from_seqlens(
query_lens, seq_lens, block_size=block_size))
prior_mask_padded = F.pad(
prior_mask,
(
0,
context_kv_len - prior_mask.shape[1],
0,
max_num_queries - prior_mask.shape[0],
),
"constant",
0,
).bool()
active_mask_padded = F.pad(
active_mask,
(
0,
max_num_queries - active_mask.shape[1],
0,
max_num_queries - active_mask.shape[0],
),
"constant",
0,
).bool()
attn_mask = torch.concat([prior_mask_padded, active_mask_padded],
dim=1)
attn_mask = reorder_context_mask(attn_mask, large_tile_size,
block_size)
input_args = (
query.to(device=device),
k.to(device=device),
v.to(device=device),
kv_cache.to(device=device),
active_block_table.to(device=device),
attn_mask.to(device=device),
)
input_kwargs = dict(
n_kv_head=num_kv_heads,
head_size=head_size,
mixed_precision=mixed_precision,
LARGE_TILE_SZ=large_tile_size,
)
output_nki = flash_attn_varlen_nkifunc(*input_args, **input_kwargs)
num_actual_tokens = sum(query_lens)
# - o: shape (bs, n_heads, seq_q, d) -> (bs, seq_q, n_heads, d)
output_nki = output_nki.cpu().permute(0, 2, 1, 3)
output_nki = output_nki[0, :num_actual_tokens, :, :]
output_ref_padded = F.pad(
output_ref,
(0, 0, 0, 0, 0, 0, 0, max_num_queries - output_ref.shape[0]),
"constant",
0,
)
output_ref = output_ref_padded.transpose(
0, 1)[0, :num_actual_tokens, :, :]
torch.testing.assert_close(output_nki, output_ref, atol=1e-2, rtol=0)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Tests for miscellaneous utilities
"""
import pytest
import torch
from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
from vllm.platforms import current_platform
@pytest.mark.parametrize(
"max_position,is_neox_style,rotary_dim,head_size,seq_len,use_key", [
(16, False, 32, 32, 1024, True),
(16, False, 32, 128, 1024, True),
(16, True, 32, 32, 1024, True),
(16, True, 32, 128, 1024, True),
(16, False, 32, 128, 1024, False),
(16, True, 32, 128, 1024, False),
])
def test_rotary_embedding_opcheck(max_position, is_neox_style, rotary_dim,
head_size, seq_len, use_key):
import torch_xla.core.xla_model as xm
device = xm.xla_device()
current_platform.seed_everything(0)
torch.set_default_device("cpu")
batch_size = 1
base = 10000
num_heads = 8
rot = RotaryEmbedding(head_size, rotary_dim, max_position, base,
is_neox_style, torch.float32)
positions = torch.randint(0,
max_position, (batch_size, seq_len),
device="cpu")
query = torch.randn(batch_size,
seq_len,
num_heads * head_size,
dtype=torch.float32,
device="cpu")
key = torch.randn_like(query) if use_key else None
assert positions.is_cpu, \
"reference input tensor is expected to be CPU tensor."
ref_query, ref_key = rot.to(device="cpu").forward_native(
positions, query, key)
out_query, out_key = rot.to(device=device).forward_neuron(
positions.to(device=device), query.to(device=device),
key.to(device=device) if key is not None else None)
if use_key:
assert out_query.is_xla and out_key.is_xla, \
"output tensor is expected to be XLA tensor"
torch.testing.assert_close(out_key.cpu(),
ref_key,
atol=1e-2,
rtol=1e-2)
else:
assert out_key is None, "expected returned key to be None"
assert out_query.is_xla, \
"output tensor is expected to be XLA tensor"
torch.testing.assert_close(out_query.cpu(),
ref_query,
atol=1e-2,
rtol=1e-2)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import functools
from typing import Callable
from unittest.mock import patch
import pytest
import torch
import torch_xla.distributed.xla_multiprocessing as xmp
from typing_extensions import ParamSpec
from vllm.distributed.communication_op import (
tensor_model_parallel_all_gather, tensor_model_parallel_all_reduce)
from vllm.distributed.parallel_state import (ensure_model_parallel_initialized,
init_distributed_environment)
from vllm.utils import get_distributed_init_method, get_open_port
_P = ParamSpec("_P")
def reinitialize_neuron_runtime(f: Callable[_P, None]) -> Callable[_P, None]:
"""Decorator to reinitialize the Neuron Runtime before executing a test.
This is necessary for distributed tests which need to reallocate Neuron
Cores to separate subprocesses.
"""
@functools.wraps(f)
def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None:
runtime = torch.classes.neuron.Runtime()
runtime.initialize()
runtime.unsafe_close()
f(*args, **kwargs)
runtime.initialize()
return wrapper
def all_gather_test_worker(index, tp_degree, distributed_init_method):
init_distributed_environment(tp_degree,
index,
distributed_init_method,
index,
backend="xla")
ensure_model_parallel_initialized(tp_degree, 1)
num_dimensions = 3
tensor_size = list(range(2, num_dimensions + 2))
total_size = 1
for s in tensor_size:
total_size *= s
all_gather_dimension = -1
all_tensors = [
torch.arange(total_size, dtype=torch.float32,
device="xla").reshape(tensor_size) * (r + 1)
for r in range(tp_degree)
]
expected = torch.cat(all_tensors, dim=all_gather_dimension)
t = all_tensors[index % tp_degree]
t = tensor_model_parallel_all_gather(t, all_gather_dimension)
torch.testing.assert_close(t, expected)
def all_reduce_test_worker(index, tp_degree, distributed_init_method):
init_distributed_environment(tp_degree,
index,
distributed_init_method,
index,
backend="xla")
ensure_model_parallel_initialized(tp_degree, 1)
num_elements = 8
all_tensors = [
torch.arange(num_elements, dtype=torch.float32, device="xla") * (r + 1)
for r in range(tp_degree)
]
expected = torch.sum(torch.stack(all_tensors, dim=0), dim=0)
t = all_tensors[index % tp_degree]
t = tensor_model_parallel_all_reduce(t)
torch.testing.assert_close(t, expected)
@pytest.mark.parametrize("tp_size", [2])
@pytest.mark.parametrize("test_target",
[all_reduce_test_worker, all_gather_test_worker])
@reinitialize_neuron_runtime
def test_neuron_multi_process_tensor_parallel(monkeypatch, tp_size,
test_target):
with patch('torch_xla._XLAC._xla_runtime_is_initialized',
return_value=False):
distributed_init_method = get_distributed_init_method(
"127.0.0.1", get_open_port())
monkeypatch.setenv("VLLM_USE_V1", "1")
monkeypatch.setenv("NEURONCORE_NUM_DEVICES", str(tp_size))
monkeypatch.setenv("NEURON_PJRT_PROCESSES_NUM_DEVICES",
','.join(['1' for _ in range(tp_size)]))
xmp.spawn(test_target, args=(tp_size, distributed_init_method))
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import os
import shutil
import tempfile
import torch
from huggingface_hub import snapshot_download
from safetensors import safe_open
from vllm import LLM, SamplingParams
def patch_eagle_draft_with_lm_head(target_model_id: str,
draft_model_id: str) -> str:
# In NxDI, draft model checkpoint must include lm_head weights from target
# model. For more details see https://awsdocs-neuron.readthedocs-hosted.com
# /en/latest/libraries/nxd-inference/developer_guides/feature-guide.html
# #eagle-checkpoint-compatibility
final_draft_dir = "/tmp/patched_eagle_draft"
with tempfile.TemporaryDirectory() as tmp_dir:
target_dir = snapshot_download(repo_id=target_model_id,
local_dir=os.path.join(
tmp_dir, "target"))
draft_dir = snapshot_download(repo_id=draft_model_id,
local_dir=os.path.join(tmp_dir, "draft"))
lm_head_key = "lm_head.weight"
index_path = os.path.join(target_dir, "model.safetensors.index.json")
with open(index_path) as f:
index = json.load(f)
shard_name = index["weight_map"][lm_head_key]
target_safetensor_path = os.path.join(target_dir, shard_name)
with safe_open(target_safetensor_path, framework="pt") as f:
target_lm_head = f.get_tensor(lm_head_key)
draft_path = os.path.join(draft_dir, "pytorch_model.bin")
draft_state_dict = torch.load(draft_path, map_location="cpu")
draft_state_dict[lm_head_key] = target_lm_head.to(torch.float16)
torch.save(draft_state_dict, draft_path)
shutil.copytree(draft_dir, final_draft_dir, dirs_exist_ok=True)
return final_draft_dir
def test_eagle():
patched_draft_path = patch_eagle_draft_with_lm_head(
target_model_id="meta-llama/Llama-2-7b-hf",
draft_model_id="yuhuili/EAGLE-llama2-chat-7B")
llm = LLM(
model="meta-llama/Llama-2-7b-hf",
speculative_config={
"model": patched_draft_path,
"num_speculative_tokens": 5,
"max_model_len": 128
},
max_num_seqs=1,
max_model_len=128,
tensor_parallel_size=2,
override_neuron_config={
"enable_eagle_speculation": True,
"enable_fused_speculation": True,
"fused_qkv": True
},
)
prompts = [
"The president of the United States is",
]
outputs = llm.generate(prompts, SamplingParams(top_k=1))
expected_output = " the head of state and head of government of " \
"the United States. The president direct"
for output in outputs:
generated_text = output.outputs[0].text
print(f"Prompt: {output.prompt!r}, Generated text: {generated_text!r}")
assert (expected_output == generated_text)
print("Neuron Eagle speculation test passed.")
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm import LLM, SamplingParams
def test_mistral():
llm = LLM(model="mistralai/Mistral-7B-v0.1",
tensor_parallel_size=2,
max_num_seqs=4,
max_model_len=128,
override_neuron_config={
"sequence_parallel_enabled": False,
"skip_warmup": True
})
# Send more prompts than the compiled batch size (4) and request
# varying generation lengths to test accuracy related to Neuron
# specific sequence id sorting.
prompts = [
"The president of the United States is",
"The capital of France is",
"What is Annapurna labs?",
"I believe the meaning of life is",
"Tell me a story about a brave knight",
"Hello, my name is Llama",
]
sampling_params = [
SamplingParams(top_k=1, max_tokens=10),
SamplingParams(top_k=1, max_tokens=20),
SamplingParams(top_k=1, max_tokens=30),
SamplingParams(top_k=1, max_tokens=40),
SamplingParams(top_k=1, max_tokens=50),
SamplingParams(top_k=1, max_tokens=60)
]
outputs = llm.generate(prompts, sampling_params)
expected_outputs = [
" the most powerful person in the world. He is",
" a city of many faces. It is a city of history, culture, art, "
"fashion, and",
"\n\nAnnapurna Labs is a semiconductor company that was founded "
"in 2013 by Amazon. The company is",
" to be happy.\n\nI believe that happiness is a choice.\n\nI "
"believe that happiness is a state of mind.\n\nI believe that "
"happiness is a journey.\n\nI believe",
" who rescued a princess from a dragon.\n\nTell me a story about"
" a princess who rescued herself from a dragon.\n\nTell me a "
"story about a princess who rescued herself from a dragon and "
"then rescued a knight from",
" and I am a 10 year old male. I am a very friendly and "
"affectionate boy who loves to be around people. I am a very "
"active boy who loves to play and run around. I am a very smart "
"boy who loves to learn new things. I am a very loyal boy"
]
for expected_output, output in zip(expected_outputs, outputs):
generated_text = output.outputs[0].text
print(f"Prompt: {output.prompt!r}, Generated text: {generated_text!r}")
assert (expected_output == generated_text)
print("Neuron Mistral test passed.")
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from huggingface_hub import snapshot_download
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest
def test_llama_single_lora():
sql_lora_files = snapshot_download(
repo_id="yard1/llama-2-7b-sql-lora-test")
llm = LLM(model="meta-llama/Llama-2-7b-hf",
tensor_parallel_size=2,
max_num_seqs=4,
max_model_len=512,
override_neuron_config={
"sequence_parallel_enabled": False,
"skip_warmup": True,
"lora_modules": [{
"name": "lora_id_1",
"path": sql_lora_files
}]
},
enable_lora=True,
max_loras=1,
max_lora_rank=256,
device="neuron")
"""For multi-lora requests using NxDI as the backend, only the lora_name
needs to be specified. The lora_id and lora_path are supplied at the LLM
class/server initialization, after which the paths are handled by NxDI"""
lora_req_1 = LoRARequest("lora_id_1", 0, " ")
prompts = [
"The president of the United States is",
"The capital of France is",
]
outputs = llm.generate(prompts,
SamplingParams(top_k=1),
lora_request=[lora_req_1, lora_req_1])
expected_outputs = [
" the head of state and head of government of the United States. "
"The president direct",
" a city of contrasts. The city is home to the Eiffel Tower"
]
for expected_output, output in zip(expected_outputs, outputs):
generated_text = output.outputs[0].text
assert (expected_output == generated_text)
def test_llama_multiple_lora():
sql_lora_files = snapshot_download(
repo_id="yard1/llama-2-7b-sql-lora-test")
llm = LLM(model="meta-llama/Llama-2-7b-hf",
tensor_parallel_size=2,
max_num_seqs=4,
max_model_len=512,
override_neuron_config={
"sequence_parallel_enabled":
False,
"skip_warmup":
True,
"lora_modules": [{
"name": "lora_id_1",
"path": sql_lora_files
}, {
"name": "lora_id_2",
"path": sql_lora_files
}]
},
enable_lora=True,
max_loras=2,
max_lora_rank=256,
device="neuron")
"""For multi-lora requests using NxDI as the backend, only the lora_name
needs to be specified. The lora_id and lora_path are supplied at the LLM
class/server initialization, after which the paths are handled by NxDI"""
lora_req_1 = LoRARequest("lora_id_1", 0, " ")
lora_req_2 = LoRARequest("lora_id_2", 1, " ")
prompts = [
"The president of the United States is",
"The capital of France is",
]
outputs = llm.generate(prompts,
SamplingParams(top_k=1),
lora_request=[lora_req_1, lora_req_2])
expected_outputs = [
" the head of state and head of government of the United States. "
"The president direct",
" a city of contrasts. The city is home to the Eiffel Tower"
]
for expected_output, output in zip(expected_outputs, outputs):
generated_text = output.outputs[0].text
assert (expected_output == generated_text)
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
def register_prithvi_india():
return "prithvi_io_processor.prithvi_processor.PrithviMultimodalDataProcessorIndia" # noqa: E501
def register_prithvi_valencia(): def register_prithvi():
return "prithvi_io_processor.prithvi_processor.PrithviMultimodalDataProcessorValencia" # noqa: E501 return "prithvi_io_processor.prithvi_processor.PrithviMultimodalDataProcessor" # noqa: E501
...@@ -8,7 +8,7 @@ import datetime ...@@ -8,7 +8,7 @@ import datetime
import os import os
import tempfile import tempfile
import urllib.request import urllib.request
from collections.abc import AsyncGenerator, Sequence from collections.abc import Sequence
from typing import Any, Optional, Union from typing import Any, Optional, Union
import albumentations import albumentations
...@@ -234,6 +234,8 @@ def load_image( ...@@ -234,6 +234,8 @@ def load_image(
class PrithviMultimodalDataProcessor(IOProcessor): class PrithviMultimodalDataProcessor(IOProcessor):
indices = [0, 1, 2, 3, 4, 5]
def __init__(self, vllm_config: VllmConfig): def __init__(self, vllm_config: VllmConfig):
super().__init__(vllm_config) super().__init__(vllm_config)
...@@ -359,14 +361,6 @@ class PrithviMultimodalDataProcessor(IOProcessor): ...@@ -359,14 +361,6 @@ class PrithviMultimodalDataProcessor(IOProcessor):
return prompts return prompts
async def pre_process_async(
self,
prompt: IOProcessorInput,
request_id: Optional[str] = None,
**kwargs,
) -> Union[PromptType, Sequence[PromptType]]:
return self.pre_process(prompt, request_id, **kwargs)
def post_process( def post_process(
self, self,
model_output: Sequence[PoolingRequestOutput], model_output: Sequence[PoolingRequestOutput],
...@@ -420,30 +414,3 @@ class PrithviMultimodalDataProcessor(IOProcessor): ...@@ -420,30 +414,3 @@ class PrithviMultimodalDataProcessor(IOProcessor):
format="tiff", format="tiff",
data=out_data, data=out_data,
request_id=request_id) request_id=request_id)
async def post_process_async(
self,
model_output: AsyncGenerator[tuple[int, PoolingRequestOutput]],
request_id: Optional[str] = None,
**kwargs,
) -> IOProcessorOutput:
collected_output = [item async for i, item in model_output]
return self.post_process(collected_output, request_id, **kwargs)
class PrithviMultimodalDataProcessorIndia(PrithviMultimodalDataProcessor):
def __init__(self, vllm_config: VllmConfig):
super().__init__(vllm_config)
self.indices = [1, 2, 3, 8, 11, 12]
class PrithviMultimodalDataProcessorValencia(PrithviMultimodalDataProcessor):
def __init__(self, vllm_config: VllmConfig):
super().__init__(vllm_config)
self.indices = [0, 1, 2, 3, 4, 5]
...@@ -9,8 +9,7 @@ setup( ...@@ -9,8 +9,7 @@ setup(
packages=["prithvi_io_processor"], packages=["prithvi_io_processor"],
entry_points={ entry_points={
"vllm.io_processor_plugins": [ "vllm.io_processor_plugins": [
"prithvi_to_tiff_india = prithvi_io_processor:register_prithvi_india", # noqa: E501 "prithvi_to_tiff = prithvi_io_processor:register_prithvi", # noqa: E501
"prithvi_to_tiff_valencia = prithvi_io_processor:register_prithvi_valencia", # noqa: E501
] ]
}, },
) )
...@@ -7,12 +7,11 @@ import requests ...@@ -7,12 +7,11 @@ import requests
from tests.utils import RemoteOpenAIServer from tests.utils import RemoteOpenAIServer
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.entrypoints.llm import LLM
from vllm.entrypoints.openai.protocol import IOProcessorResponse from vllm.entrypoints.openai.protocol import IOProcessorResponse
from vllm.plugins.io_processors import get_io_processor from vllm.plugins.io_processors import get_io_processor
from vllm.pooling_params import PoolingParams from vllm.pooling_params import PoolingParams
MODEL_NAME = "christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM" MODEL_NAME = "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"
image_url = "https://huggingface.co/christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM/resolve/main/valencia_example_2024-10-26.tiff" # noqa: E501 image_url = "https://huggingface.co/christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM/resolve/main/valencia_example_2024-10-26.tiff" # noqa: E501
...@@ -23,61 +22,7 @@ def test_loading_missing_plugin(): ...@@ -23,61 +22,7 @@ def test_loading_missing_plugin():
get_io_processor(vllm_config, "wrong_plugin") get_io_processor(vllm_config, "wrong_plugin")
def test_loading_engine_with_wrong_plugin(): @pytest.fixture(scope="function")
with pytest.raises(ValueError):
LLM(
model=MODEL_NAME,
skip_tokenizer_init=True,
trust_remote_code=True,
enforce_eager=True,
# Limit the maximum number of parallel requests
# to avoid the model going OOM in CI.
max_num_seqs=32,
io_processor_plugin="wrong_plugin",
)
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_prithvi_mae_plugin_offline(vllm_runner, model_name: str):
img_prompt = dict(
data=image_url,
data_format="url",
image_format="tiff",
out_data_format="b64_json",
)
pooling_params = PoolingParams(task="encode", softmax=False)
with vllm_runner(
model_name,
runner="pooling",
skip_tokenizer_init=True,
trust_remote_code=True,
enforce_eager=True,
# Limit the maximum number of parallel requests
# to avoid the model going OOM in CI.
max_num_seqs=1,
io_processor_plugin="prithvi_to_tiff_valencia",
) as llm_runner:
pooler_output = llm_runner.get_llm().encode(
img_prompt,
pooling_params=pooling_params,
)
output = pooler_output[0].outputs
# verify the output is formatted as expected for this plugin
assert all(
hasattr(output, attr)
for attr in ["type", "format", "data", "request_id"])
# We just check that the output is a valid base64 string.
# Raises an exception and fails the test if the string is corrupted.
base64.b64decode(output.data)
@pytest.fixture(scope="module")
def server(): def server():
args = [ args = [
"--runner", "--runner",
...@@ -90,7 +35,9 @@ def server(): ...@@ -90,7 +35,9 @@ def server():
"--max-num-seqs", "--max-num-seqs",
"32", "32",
"--io-processor-plugin", "--io-processor-plugin",
"prithvi_to_tiff_valencia" "prithvi_to_tiff",
"--model-impl",
"terratorch",
] ]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
...@@ -113,6 +60,7 @@ async def test_prithvi_mae_plugin_online( ...@@ -113,6 +60,7 @@ async def test_prithvi_mae_plugin_online(
}, },
"priority": 0, "priority": 0,
"model": model_name, "model": model_name,
"softmax": False
} }
ret = requests.post( ret = requests.post(
...@@ -135,3 +83,43 @@ async def test_prithvi_mae_plugin_online( ...@@ -135,3 +83,43 @@ async def test_prithvi_mae_plugin_online(
# We just check that the output is a valid base64 string. # We just check that the output is a valid base64 string.
# Raises an exception and fails the test if the string is corrupted. # Raises an exception and fails the test if the string is corrupted.
base64.b64decode(plugin_data["data"]) base64.b64decode(plugin_data["data"])
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_prithvi_mae_plugin_offline(vllm_runner, model_name: str):
img_prompt = dict(
data=image_url,
data_format="url",
image_format="tiff",
out_data_format="b64_json",
)
pooling_params = PoolingParams(task="encode", softmax=False)
with vllm_runner(
model_name,
runner="pooling",
skip_tokenizer_init=True,
trust_remote_code=True,
enforce_eager=True,
# Limit the maximum number of parallel requests
# to avoid the model going OOM in CI.
max_num_seqs=1,
model_impl="terratorch",
io_processor_plugin="prithvi_to_tiff",
) as llm_runner:
pooler_output = llm_runner.get_llm().encode(
img_prompt,
pooling_params=pooling_params,
)
output = pooler_output[0].outputs
# verify the output is formatted as expected for this plugin
assert all(
hasattr(output, attr)
for attr in ["type", "format", "data", "request_id"])
# We just check that the output is a valid base64 string.
# Raises an exception and fails the test if the string is corrupted.
base64.b64decode(output.data)
...@@ -27,7 +27,7 @@ def use_v0_only(monkeypatch): ...@@ -27,7 +27,7 @@ def use_v0_only(monkeypatch):
reason="ModelOpt FP8 is not supported on this GPU type.") reason="ModelOpt FP8 is not supported on this GPU type.")
def test_modelopt_fp8_checkpoint_setup(vllm_runner): def test_modelopt_fp8_checkpoint_setup(vllm_runner):
"""Test ModelOpt FP8 checkpoint loading and structure validation.""" """Test ModelOpt FP8 checkpoint loading and structure validation."""
# TODO: provide a small publically available test checkpoint # TODO: provide a small publicly available test checkpoint
model_path = ("/home/scratch.omniml_data_1/zhiyu/ckpts/test_ckpts/" model_path = ("/home/scratch.omniml_data_1/zhiyu/ckpts/test_ckpts/"
"TinyLlama-1.1B-Chat-v1.0-fp8-0710") "TinyLlama-1.1B-Chat-v1.0-fp8-0710")
......
...@@ -75,5 +75,25 @@ def test_qwenvl_int8wo_model_loading_with_params(vllm_runner): ...@@ -75,5 +75,25 @@ def test_qwenvl_int8wo_model_loading_with_params(vllm_runner):
print(output) print(output)
@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
@pytest.mark.skip(
reason="since torchao nightly is only compatible with torch nightly"
"currently https://github.com/pytorch/ao/issues/2919, we'll have to skip "
"torchao tests that requires newer versions (0.14.0.dev+) for now")
def test_opt_125m_awq_int4wo_model_loading_with_params(vllm_runner):
torch._dynamo.reset()
model_name = ("torchao-testing/opt-125m-AWQConfig-Int4WeightOnlyConfig-v2"
"-0.14.0.dev")
with vllm_runner(model_name=model_name,
quantization="torchao",
dtype="bfloat16",
pt_load_map_location="cuda:0") as llm:
output = llm.generate_greedy(["The capital of France is"],
max_tokens=32)
assert output
print(output)
if __name__ == "__main__": if __name__ == "__main__":
pytest.main([__file__]) pytest.main([__file__])
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm import SamplingParams from vllm import SamplingParams
from vllm.config import LoadConfig from vllm.config.load import LoadConfig
from vllm.model_executor.model_loader import get_model_loader from vllm.model_executor.model_loader import get_model_loader
load_format = "runai_streamer" load_format = "runai_streamer"
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import glob
import os
import tempfile
import huggingface_hub.constants
from vllm.model_executor.model_loader.weight_utils import (
download_weights_from_hf)
from vllm.transformers_utils.runai_utils import (is_runai_obj_uri,
list_safetensors)
def test_is_runai_obj_uri():
assert is_runai_obj_uri("gs://some-gcs-bucket/path")
assert is_runai_obj_uri("s3://some-s3-bucket/path")
assert not is_runai_obj_uri("nfs://some-nfs-path")
def test_runai_list_safetensors_local():
with tempfile.TemporaryDirectory() as tmpdir:
huggingface_hub.constants.HF_HUB_OFFLINE = False
download_weights_from_hf("openai-community/gpt2",
allow_patterns=["*.safetensors", "*.json"],
cache_dir=tmpdir)
safetensors = glob.glob(f"{tmpdir}/**/*.safetensors", recursive=True)
assert len(safetensors) > 0
parentdir = [
os.path.dirname(safetensor) for safetensor in safetensors
][0]
files = list_safetensors(parentdir)
assert len(safetensors) == len(files)
if __name__ == "__main__":
test_is_runai_obj_uri()
test_runai_list_safetensors_local()
...@@ -82,7 +82,7 @@ def test_beam_search_with_concurrency_limit( ...@@ -82,7 +82,7 @@ def test_beam_search_with_concurrency_limit(
beam_width: int, beam_width: int,
) -> None: ) -> None:
# example_prompts[1]&[3]&[7] fails due to unknown reason even without # example_prompts[1]&[3]&[7] fails due to unknown reason even without
# concurency limit. skip them for now. # concurrency limit. skip them for now.
example_prompts = (example_prompts[:8]) example_prompts = (example_prompts[:8])
concurrency_limit = 2 concurrency_limit = 2
assert len(example_prompts) > concurrency_limit assert len(example_prompts) > concurrency_limit
......
...@@ -161,11 +161,11 @@ def test_load_without_tensorizer_load_format(vllm_runner, capfd, model_ref): ...@@ -161,11 +161,11 @@ def test_load_without_tensorizer_load_format(vllm_runner, capfd, model_ref):
model = vllm_runner( model = vllm_runner(
model_ref, model_ref,
model_loader_extra_config=TensorizerConfig(tensorizer_uri="test")) model_loader_extra_config=TensorizerConfig(tensorizer_uri="test"))
pytest.fail("Expected RuntimeError for extra config keys")
except RuntimeError: except RuntimeError:
out, err = capfd.readouterr() out, err = capfd.readouterr()
combined_output = out + err combined_output = out + err
assert ("ValueError: Model loader extra config " assert ("ValueError: Unexpected extra config keys for load "
"is not supported for load "
"format auto") in combined_output "format auto") in combined_output
finally: finally:
del model del model
...@@ -181,11 +181,12 @@ def test_raise_value_error_on_invalid_load_format(vllm_runner, capfd, ...@@ -181,11 +181,12 @@ def test_raise_value_error_on_invalid_load_format(vllm_runner, capfd,
model_ref, model_ref,
load_format="safetensors", load_format="safetensors",
model_loader_extra_config=TensorizerConfig(tensorizer_uri="test")) model_loader_extra_config=TensorizerConfig(tensorizer_uri="test"))
pytest.fail("Expected RuntimeError for extra config keys")
except RuntimeError: except RuntimeError:
out, err = capfd.readouterr() out, err = capfd.readouterr()
combined_output = out + err combined_output = out + err
assert ("ValueError: Model loader extra config is not supported " assert ("ValueError: Unexpected extra config keys "
"for load format safetensors") in combined_output "for load format safetensors") in combined_output
finally: finally:
del model del model
......
...@@ -6,8 +6,9 @@ from dataclasses import MISSING, Field, asdict, dataclass, field ...@@ -6,8 +6,9 @@ from dataclasses import MISSING, Field, asdict, dataclass, field
import pytest import pytest
from vllm.compilation.backends import VllmBackend from vllm.compilation.backends import VllmBackend
from vllm.config import (LoadConfig, ModelConfig, PoolerConfig, VllmConfig, from vllm.config import (ModelConfig, PoolerConfig, VllmConfig, get_field,
get_field, update_config) update_config)
from vllm.config.load import LoadConfig
from vllm.model_executor.layers.pooler import PoolingType from vllm.model_executor.layers.pooler import PoolingType
from vllm.platforms import current_platform from vllm.platforms import current_platform
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import pytest
from openai_harmony import (Conversation, DeveloperContent,
HarmonyEncodingName, Message, Role, SystemContent,
load_harmony_encoding)
from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall
from vllm.entrypoints.openai.tool_parsers import OpenAIToolParser
from vllm.transformers_utils.tokenizer import get_tokenizer
MODEL = "gpt2"
@pytest.fixture(scope="module")
def openai_tokenizer():
# The parser does not use the tokenizer, but the constructor requires it.
return get_tokenizer(MODEL)
@pytest.fixture
def openai_tool_parser(openai_tokenizer):
return OpenAIToolParser(openai_tokenizer)
@pytest.fixture(scope="module")
def harmony_encoding():
return load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
def assert_tool_calls(
actual_tool_calls: list[ToolCall],
expected_tool_calls: list[ToolCall],
):
assert len(actual_tool_calls) == len(expected_tool_calls)
for actual_tool_call, expected_tool_call in zip(actual_tool_calls,
expected_tool_calls):
assert isinstance(actual_tool_call.id, str)
assert len(actual_tool_call.id) > 16 # Default from protocol.py
assert actual_tool_call.type == "function"
assert actual_tool_call.function == expected_tool_call.function
def test_extract_tool_calls_no_tools(openai_tool_parser, harmony_encoding):
convo = Conversation.from_messages([
Message.from_role_and_content(
Role.SYSTEM,
SystemContent.new(),
),
Message.from_role_and_content(
Role.DEVELOPER,
DeveloperContent.new().with_instructions("Talk like a pirate!")),
Message.from_role_and_content(Role.USER, "Arrr, how be you?"),
Message.from_role_and_content(Role.ASSISTANT,
"This is a test").with_channel("final")
])
token_ids = harmony_encoding.render_conversation_for_completion(
convo, Role.ASSISTANT)
extracted_info = openai_tool_parser.extract_tool_calls(
"",
request=None,
token_ids=token_ids,
)
assert not extracted_info.tools_called
assert extracted_info.tool_calls == []
assert extracted_info.content == "This is a test"
def test_extract_tool_calls_single_tool(openai_tool_parser, harmony_encoding):
convo = Conversation.from_messages([
Message.from_role_and_content(Role.USER,
"What is the weather in Tokyo?"),
Message.from_role_and_content(
Role.ASSISTANT,
'User asks: "What is the weather in Tokyo?" We need to use get_current_weather tool.', # noqa: E501
).with_channel("analysis"),
Message.from_role_and_content(
Role.ASSISTANT,
'{"location": "Tokyo"}').with_channel("commentary").with_recipient(
"functions.get_current_weather").with_content_type("json"),
])
token_ids = harmony_encoding.render_conversation_for_completion(
convo, Role.ASSISTANT)
extracted_info = openai_tool_parser.extract_tool_calls(
"",
request=None,
token_ids=token_ids,
)
assert extracted_info.tools_called
expected_tool_calls = [
ToolCall(function=FunctionCall(
name="get_current_weather",
arguments=json.dumps({"location": "Tokyo"}),
))
]
assert_tool_calls(extracted_info.tool_calls, expected_tool_calls)
assert extracted_info.content is None
def test_extract_tool_calls_multiple_tools(
openai_tool_parser,
harmony_encoding,
):
convo = Conversation.from_messages([
Message.from_role_and_content(
Role.USER, "What is the weather in Tokyo based on where I'm at?"),
Message.from_role_and_content(
Role.ASSISTANT,
'User asks: "What is the weather in Tokyo?" based on their location. We need to use get_current_weather tool and get_user_location tool.', # noqa: E501
).with_channel("analysis"),
Message.from_role_and_content(
Role.ASSISTANT,
'{"location": "Tokyo"}').with_channel("commentary").with_recipient(
"functions.get_current_weather").with_content_type("json"),
Message.from_role_and_content(
Role.ASSISTANT,
'{"location": "Tokyo"}').with_channel("commentary").with_recipient(
"functions.get_user_location").with_content_type("json"),
])
token_ids = harmony_encoding.render_conversation_for_completion(
convo,
Role.ASSISTANT,
)
extracted_info = openai_tool_parser.extract_tool_calls(
"",
request=None,
token_ids=token_ids,
)
assert extracted_info.tools_called
expected_tool_calls = [
ToolCall(function=FunctionCall(
name="get_current_weather",
arguments=json.dumps({"location": "Tokyo"}),
)),
ToolCall(function=FunctionCall(
name="get_user_location",
arguments=json.dumps({"location": "Tokyo"}),
))
]
assert_tool_calls(extracted_info.tool_calls, expected_tool_calls)
assert extracted_info.content is None
...@@ -28,7 +28,7 @@ ACCURACY_CONFIGS = [ ...@@ -28,7 +28,7 @@ ACCURACY_CONFIGS = [
expected_value=0.76), # no bias expected_value=0.76), # no bias
# NOTE(rob): We cannot re-initialize vLLM in the same process for TPU, # NOTE(rob): We cannot re-initialize vLLM in the same process for TPU,
# so only one of these tests can run in a single call to pytest. As # so only one of these tests can run in a single call to pytest. As
# a follow up, move this into the LM-EVAL section of the CI. # a follow-up, move this into the LM-EVAL section of the CI.
# GSM8KAccuracyTestConfig( # GSM8KAccuracyTestConfig(
# model_name="neuralmagic/Qwen2-7B-Instruct-quantized.w8a8", # model_name="neuralmagic/Qwen2-7B-Instruct-quantized.w8a8",
# expected_value=0.66), # bias in QKV layers # expected_value=0.66), # bias in QKV layers
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment