"tests/mxnet/test_function.py" did not exist on "3e43d7b8d203df1f2e2e2f0b5c029dfebeec549b"
Unverified commit 8bee20f8, authored by Yineng Zhang, committed by GitHub

Update vllm to 0.6.3 (#1711) (#1720)


Co-authored-by: Ke Bao <ISPObaoke@163.com>
parent 12cad0fe
@@ -208,7 +208,7 @@ if __name__ == "__main__":
         model_override_args["image_token_index"] = 64002
         if args.num_frames == 32:
-            model_override_args["rope_scaling"] = {"factor": 2.0, "type": "linear"}
+            model_override_args["rope_scaling"] = {"factor": 2.0, "rope_type": "linear"}
             model_override_args["max_sequence_length"] = 4096 * 2
             model_override_args["tokenizer_model_max_length"] = 4096 * 2
         elif args.num_frames < 32:
...
@@ -26,7 +26,7 @@ runtime_common = ["aiohttp", "decord", "fastapi", "hf_transfer", "huggingface_hu
                   "outlines>=0.0.44", "modelscope"]
 # xpu is not enabled in public vllm and torch whl,
 # need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.html install vllm
-srt = ["sglang[runtime_common]", "torch", "vllm==0.5.5"]
+srt = ["sglang[runtime_common]", "torch", "vllm==0.6.3.post1"]
srt_xpu = ["sglang[runtime_common]"]
openai = ["openai>=1.0", "tiktoken"]
...
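After reinstalling with the new pin above, a quick sanity check that the environment actually picked up the expected wheel (a minimal sketch; assumes vllm is importable in the target environment):

    import vllm

    # The "srt" extra now pins vllm==0.6.3.post1; fail fast if an older
    # wheel (e.g. the previous 0.5.5) is still installed.
    assert vllm.__version__.startswith("0.6.3"), vllm.__version__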
@@ -14,7 +14,7 @@ if __name__ == "__main__":
     model_override_args["num_frames"] = 16
     model_override_args["model_type"] = "llavavid"
     if model_override_args["num_frames"] == 32:
-        model_override_args["rope_scaling"] = {"factor": 2.0, "type": "linear"}
+        model_override_args["rope_scaling"] = {"factor": 2.0, "rope_type": "linear"}
         model_override_args["max_sequence_length"] = 4096 * 2
         model_override_args["tokenizer_model_max_length"] = 4096 * 2
         model_override_args["model_max_length"] = 4096 * 2
...
@@ -20,8 +20,10 @@ from vllm.distributed import (
 from vllm.model_executor.layers.linear import LinearBase
 from vllm.model_executor.parameter import (
     BasevLLMParameter,
+    PackedColumnParameter,
     PackedvLLMParameter,
     PerTensorScaleParameter,
+    RowvLLMParameter,
 )
 from sglang.srt.layers.quantization.base_config import (
@@ -39,6 +41,7 @@ WEIGHT_LOADER_V2_SUPPORTED = [
     "GPTQMarlinLinearMethod",
     "Fp8LinearMethod",
     "MarlinLinearMethod",
+    "GPTQLinearMethod",
 ]
@@ -50,7 +53,7 @@ def adjust_marlin_shard(param, shard_size, shard_offset):
     return shard_size * marlin_tile_size, shard_offset * marlin_tile_size


-def adjust_bitsandbytes_shard(
+def adjust_bitsandbytes_4bit_shard(
     param: Parameter, qkv_offsets: Dict[str, Tuple[int, int]], loaded_shard_id: str
 ) -> Tuple[int, int]:
     """Adjust the quantization offsets and sizes for BitsAndBytes sharding."""
@@ -207,7 +210,6 @@ class ReplicatedLinear(LinearBase):
                 self.output_size,
                 self.params_dtype,
                 weight_loader=self.weight_loader,
-                prefix=prefix,
             )
         if bias:
@@ -315,7 +317,6 @@ class ColumnParallelLinear(LinearBase):
                 if self.quant_method.__class__.__name__ in WEIGHT_LOADER_V2_SUPPORTED
                 else self.weight_loader
             ),
-            prefix=prefix,
         )
         if bias:
             self.bias = Parameter(
@@ -345,8 +346,12 @@ class ColumnParallelLinear(LinearBase):
         if is_gguf_weight and isinstance(param, UninitializedParameter):
             param.materialize(loaded_weight.shape, dtype=loaded_weight.dtype)

+        use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)
+
         param_data = param.data
-        if output_dim is not None:
+        # bitsandbytes loads the weights of the specific portion
+        # no need to narrow here
+        if output_dim is not None and not use_bitsandbytes_4bit:
             shard_size = param_data.shape[output_dim]
             start_idx = tp_rank * shard_size
             loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
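The narrowing in the hunk above is plain tensor-parallel slicing along the output dimension, which the new code skips for bitsandbytes 4-bit params since those already arrive pre-sliced. A standalone sketch of the same narrow() pattern (hypothetical sizes, torch only, not sglang code):

    import torch

    # Hypothetical full weight: 8 output rows, 4 input columns, 2 TP ranks.
    full_weight = torch.arange(32, dtype=torch.float32).reshape(8, 4)
    tp_rank, tp_size, output_dim = 1, 2, 0

    # Each rank keeps one contiguous slice along output_dim, which is what
    # loaded_weight.narrow(output_dim, start_idx, shard_size) does above.
    shard_size = full_weight.shape[output_dim] // tp_size
    start_idx = tp_rank * shard_size
    shard = full_weight.narrow(output_dim, start_idx, shard_size)
    assert shard.shape == (4, 4)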
@@ -454,17 +459,22 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
             param.shard_weight_type[loaded_shard_id] = loaded_weight.item()
             return

-        if is_gguf_weight and isinstance(param, UninitializedParameter):
-            from gguf.constants import GGML_QUANT_SIZES
-
-            ori_shape = param.tensor_shape
-            weight_types = self.qweight_type.shard_weight_type.values()
-            row_size = []
-            for weight_type in weight_types:
-                block_size, type_size = GGML_QUANT_SIZES[weight_type]
-                row_size.append(ori_shape[1] // block_size * type_size)
-            q_shape = (ori_shape[0], max(row_size))
-            param.materialize(q_shape, dtype=loaded_weight.dtype)
+        if is_gguf_weight:
+            tp_size = get_tensor_model_parallel_world_size()
+            tp_rank = get_tensor_model_parallel_rank()
+
+            output_dim = getattr(param, "output_dim", None)
+            shard_size = loaded_weight.size(output_dim) // tp_size
+            start_idx = tp_rank * shard_size
+
+            loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
+
+            param.shard_id.append(loaded_shard_id)
+            param.shard_id_map[loaded_shard_id] = len(param.data_container)
+            param.data_container.append(loaded_weight)
+            if len(param.data_container) == 2:
+                self.qweight = param.materialize_nested()
+            return

         param_data = param.data
         output_dim = getattr(param, "output_dim", None)
@@ -526,26 +536,17 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
                         param, shard_size, shard_offset
                     )

-                use_bitsandbytes = getattr(param, "use_bitsandbytes", False)
-                if use_bitsandbytes:
+                use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)
+                if use_bitsandbytes_4bit:
                     shard_size = loaded_weight.shape[output_dim]
                     shard_offset = loaded_weight.shape[output_dim] * loaded_shard_id

-                if is_gguf_weight:
-                    tp_size = get_tensor_model_parallel_world_size()
-                    output_dim = getattr(param, "output_dim", None)
-                    shard_shape = list(loaded_weight.shape)
-                    shard_shape[output_dim] = shard_shape[output_dim] // tp_size
-                    param.shard_id.append(loaded_shard_id)
-                    param.shard_size[loaded_shard_id] = shard_shape
-
-                    input_dim = getattr(param, "input_dim", None)
-                    input_size = loaded_weight.shape[input_dim]
-                    param_data = param_data.narrow(input_dim, 0, input_size)
-
                 param_data = param_data.narrow(output_dim, shard_offset, shard_size)
                 start_idx = tp_rank * shard_size
-                loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
+                # bitsandbytes loads the weights of the specific portion
+                # no need to narrow here
+                if not use_bitsandbytes_4bit:
+                    loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)

             # Special case for AQLM codebooks.
             elif is_metadata:
                 # metadata indicates fixed size concatenated along dim 0
@@ -595,7 +596,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
             # If quantized, we need to adjust the offset and size to account
             # for the packing.
             if (
-                isinstance(param, PackedvLLMParameter)
+                isinstance(param, (PackedColumnParameter, PackedvLLMParameter))
                 and param.packed_dim == param.output_dim
             ):
                 shard_size, shard_offset = param.adjust_shard_indexes_for_packing(
@@ -617,7 +618,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
             if isinstance(param, PerTensorScaleParameter):
                 param.load_merged_column_weight(loaded_weight=loaded_weight, shard_id=0)
                 return
-            elif type(param) is BasevLLMParameter:
+            elif type(param) in (RowvLLMParameter, BasevLLMParameter):
                 param.load_merged_column_weight(loaded_weight=loaded_weight)
                 return
             self._load_fused_module_from_checkpoint(param, loaded_weight)
@@ -760,7 +761,7 @@ class QKVParallelLinear(ColumnParallelLinear):
             # If quantized, we need to adjust the offset and size to account
             # for the packing.
             if (
-                isinstance(param, PackedvLLMParameter)
+                isinstance(param, (PackedColumnParameter, PackedvLLMParameter))
                 and param.packed_dim == param.output_dim
             ):
                 shard_size, shard_offset = param.adjust_shard_indexes_for_packing(
@@ -780,10 +781,10 @@ class QKVParallelLinear(ColumnParallelLinear):
         ):
             if loaded_shard_id is None:  # special case for certain models
                 if isinstance(param, PerTensorScaleParameter):
-                    param.load_merged_column_weight(loaded_weight=loaded_weight, shard_id=0)
+                    param.load_qkv_weight(loaded_weight=loaded_weight, shard_id=0)
                     return
-                elif type(param) is BasevLLMParameter:
-                    param.load_merged_column_weight(loaded_weight=loaded_weight)
+                elif type(param) in (RowvLLMParameter, BasevLLMParameter):
+                    param.load_qkv_weight(loaded_weight=loaded_weight)
                     return
                 self._load_fused_module_from_checkpoint(param, loaded_weight)
                 return
@@ -818,17 +819,22 @@ class QKVParallelLinear(ColumnParallelLinear):
             param.shard_weight_type[loaded_shard_id] = loaded_weight.item()
             return

-        if is_gguf_weight and isinstance(param, UninitializedParameter):
-            from gguf.constants import GGML_QUANT_SIZES
-
-            ori_shape = param.tensor_shape
-            weight_types = self.qweight_type.shard_weight_type.values()
-            row_size = []
-            for weight_type in weight_types:
-                block_size, type_size = GGML_QUANT_SIZES[weight_type]
-                row_size.append(ori_shape[1] // block_size * type_size)
-            q_shape = (ori_shape[0], max(row_size))
-            param.materialize(q_shape, dtype=loaded_weight.dtype)
+        if is_gguf_weight:
+            tp_size = get_tensor_model_parallel_world_size()
+            tp_rank = get_tensor_model_parallel_rank()
+
+            output_dim = getattr(param, "output_dim", None)
+            shard_size = loaded_weight.size(output_dim) // tp_size
+            start_idx = tp_rank * shard_size
+
+            loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
+
+            param.shard_id.append(loaded_shard_id)
+            param.shard_id_map[loaded_shard_id] = len(param.data_container)
+            param.data_container.append(loaded_weight)
+            if len(param.data_container) == 3:
+                self.qweight = param.materialize_nested()
+            return

         param_data = param.data
         output_dim = getattr(param, "output_dim", None)
@@ -863,6 +869,8 @@ class QKVParallelLinear(ColumnParallelLinear):
                     self.total_num_kv_heads * self.head_size,
                 ),
             ]
+            use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)
+
             packed_dim = getattr(param, "packed_dim", None)
             for shard_id, shard_offset, shard_size in shard_offsets:
                 # Special case for Quantized Weights.
@@ -877,6 +885,29 @@ class QKVParallelLinear(ColumnParallelLinear):
                         param, shard_size, shard_offset
                     )

+                if use_bitsandbytes_4bit:
+                    orig_qkv_offsets = {
+                        "q": (0, self.total_num_heads * self.head_size),
+                        "k": (
+                            self.total_num_heads * self.head_size,
+                            self.total_num_kv_heads * self.head_size,
+                        ),
+                        "v": (
+                            (self.total_num_heads + self.total_num_kv_heads)
+                            * self.head_size,
+                            self.total_num_kv_heads * self.head_size,
+                        ),
+                        "total": (
+                            (self.total_num_heads + 2 * self.total_num_kv_heads)
+                            * self.head_size,
+                            0,
+                        ),
+                    }
+
+                    shard_size, shard_offset = adjust_bitsandbytes_4bit_shard(
+                        param, orig_qkv_offsets, shard_id
+                    )
+
                 loaded_weight_shard = loaded_weight.narrow(
                     output_dim, shard_offset, shard_size
                 )
@@ -910,8 +941,8 @@ class QKVParallelLinear(ColumnParallelLinear):
                     param, shard_size, shard_offset
                 )

-            use_bitsandbytes = getattr(param, "use_bitsandbytes", False)
-            if use_bitsandbytes:
+            use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)
+            if use_bitsandbytes_4bit:
                 orig_qkv_offsets = {
                     "q": (0, self.num_heads * self.head_size),
                     "k": (
@@ -927,29 +958,22 @@ class QKVParallelLinear(ColumnParallelLinear):
                         0,
                     ),
                 }
-                shard_size, shard_offset = adjust_bitsandbytes_shard(
+                shard_size, shard_offset = adjust_bitsandbytes_4bit_shard(
                     param, orig_qkv_offsets, loaded_shard_id
                 )

-            if is_gguf_weight:
-                tp_size = get_tensor_model_parallel_world_size()
-                output_dim = getattr(param, "output_dim", None)
-                shard_shape = list(loaded_weight.shape)
-                shard_shape[output_dim] = shard_shape[output_dim] // tp_size
-                param.shard_id.append(loaded_shard_id)
-                param.shard_size[loaded_shard_id] = shard_shape
-
-                input_dim = getattr(param, "input_dim", None)
-                input_size = loaded_weight.shape[input_dim]
-                param_data = param_data.narrow(input_dim, 0, input_size)
-
             param_data = param_data.narrow(output_dim, shard_offset, shard_size)
             if loaded_shard_id == "q":
                 shard_id = tp_rank
             else:
                 shard_id = tp_rank // self.num_kv_head_replicas
             start_idx = shard_id * shard_size
-            loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
+            # bitsandbytes loads the weights of the specific portion
+            # no need to narrow here
+            if not use_bitsandbytes_4bit:
+                loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)

         # Special case for for AQLM codebooks.
         elif is_metadata:
             # metadata indicates fixed size concatenated along dim 0
@@ -1037,7 +1061,6 @@ class RowParallelLinear(LinearBase):
                 if self.quant_method.__class__.__name__ in WEIGHT_LOADER_V2_SUPPORTED
                 else self.weight_loader
             ),
-            prefix=prefix,
         )
         if not reduce_results and (bias and not skip_bias_add):
             raise ValueError(
@@ -1061,6 +1084,7 @@ class RowParallelLinear(LinearBase):
         tp_rank = get_tensor_model_parallel_rank()
         tp_size = get_tensor_model_parallel_world_size()
         input_dim = getattr(param, "input_dim", None)
+        use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)

         # Special case for GGUF
         is_gguf_weight = getattr(param, "is_gguf_weight", False)
@@ -1076,7 +1100,9 @@ class RowParallelLinear(LinearBase):
             param.materialize(tuple(weight_shape), dtype=loaded_weight.dtype)

         param_data = param.data
-        if input_dim is not None:
+        # bitsandbytes loads the weights of the specific portion
+        # no need to narrow here
+        if input_dim is not None and not use_bitsandbytes_4bit:
             shard_size = param_data.shape[input_dim]
             start_idx = tp_rank * shard_size
             loaded_weight = loaded_weight.narrow(input_dim, start_idx, shard_size)
...
@@ -351,7 +351,9 @@ class LoRAAdapter(nn.Module):
         loader = DefaultModelLoader(self.load_config)
         revision = getattr(self.config.hf_config, "revision", None)
         for name, loaded_weight in loader._get_weights_iterator(
-            model_path, revision=revision, fall_back_to_pt=True
+            DefaultModelLoader.Source(
+                model_path, revision=revision, fall_back_to_pt=True
+            )
         ):
             match = re.search(r"layers\.(\d+)\.", name)
             if match is not None:
...
@@ -59,8 +59,11 @@ from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import (
     enable_show_time_cost,
     get_available_gpu_memory,
+    is_attention_free_model,
+    is_embedding_model,
     is_generation_model,
     is_multimodal_model,
+    model_has_inner_state,
     monkey_patch_vllm_dummy_weight_loader,
     monkey_patch_vllm_p2p_access_check,
 )
@@ -316,11 +319,13 @@ class ModelRunner:
         def get_weight_iter(config):
             iter = loader._get_weights_iterator(
-                config.model,
-                config.revision,
-                fall_back_to_pt=getattr(
-                    self.model, "fall_back_to_pt_during_load", True
-                ),
+                DefaultModelLoader.Source(
+                    config.model,
+                    revision=config.revision,
+                    fall_back_to_pt=getattr(
+                        self.model, "fall_back_to_pt_during_load", True
+                    ),
+                )
             )
             return iter
@@ -662,3 +667,7 @@ def load_model_cls_srt(model_arch: str) -> Optional[Type[nn.Module]]:
 # Monkey patch model loader
 setattr(ModelRegistry, "_try_load_model_cls", load_model_cls_srt)
+setattr(ModelRegistry, "is_multimodal_model", is_multimodal_model)
+setattr(ModelRegistry, "is_attention_free_model", is_attention_free_model)
+setattr(ModelRegistry, "model_has_inner_state", model_has_inner_state)
+setattr(ModelRegistry, "is_embedding_model", is_embedding_model)
@@ -250,7 +250,7 @@ class DeepseekV2Attention(nn.Module):
             bias=False,
             quant_config=quant_config,
         )
-        rope_scaling["type"] = "deepseek_yarn"
+        rope_scaling["rope_type"] = "deepseek_yarn"
         self.rotary_emb = get_rope(
             qk_rope_head_dim,
             rotary_dim=qk_rope_head_dim,
@@ -398,7 +398,7 @@ class DeepseekV2AttentionMLA(nn.Module):
             bias=False,
             quant_config=quant_config,
         )
-        rope_scaling["type"] = "deepseek_yarn"
+        rope_scaling["rope_type"] = "deepseek_yarn"
         self.rotary_emb = get_rope(
             qk_rope_head_dim,
             rotary_dim=qk_rope_head_dim,
...
@@ -215,6 +215,26 @@ def is_multimodal_model(model_architectures):
     return False


+def is_attention_free_model(model_architectures):
+    return False
+
+
+def model_has_inner_state(model_architectures):
+    return False
+
+
+def is_embedding_model(model_architectures):
+    if (
+        "LlamaEmbeddingModel" in model_architectures
+        or "MistralModel" in model_architectures
+        or "LlamaForSequenceClassification" in model_architectures
+        or "LlamaForSequenceClassificationWithNormal_Weights" in model_architectures
+    ):
+        return True
+    else:
+        return False
+
+
 def is_generation_model(model_architectures, is_embedding: bool = False):
     # We have two ways to determine whether a model is a generative model.
     # 1. Check the model architectue
...
@@ -11,13 +11,13 @@ class TestPrepareServerArgs(unittest.TestCase):
                 "--model-path",
                 "model_path",
                 "--json-model-override-args",
-                '{"rope_scaling": {"factor": 2.0, "type": "linear"}}',
+                '{"rope_scaling": {"factor": 2.0, "rope_type": "linear"}}',
             ]
         )

         self.assertEqual(server_args.model_path, "model_path")
         self.assertEqual(
             json.loads(server_args.json_model_override_args),
-            {"rope_scaling": {"factor": 2.0, "type": "linear"}},
+            {"rope_scaling": {"factor": 2.0, "rope_type": "linear"}},
         )
...
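The same key rename applies to anything passed through --json-model-override-args; a minimal sketch of building the override string expected after this upgrade (illustrative only, values taken from the test above):

    import json

    # vllm 0.6.3 reads rope_scaling["rope_type"]; the old "type" key is what
    # this commit migrates away from throughout the repo.
    override = {"rope_scaling": {"factor": 2.0, "rope_type": "linear"}}
    print(json.dumps(override))
    # {"rope_scaling": {"factor": 2.0, "rope_type": "linear"}}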