sglang / Commits / 5dc54f1a

Commit 5dc54f1a (unverified), authored Jan 17, 2025 by Yineng Zhang and committed via GitHub on Jan 17, 2025.
Parent: f3e9b489

feat: remove vllm distributed (#2907)

Co-authored-by: Zhangyi <1109276519@qq.com>
Changes: 45 files in this commit; this page shows 20 changed files with 60 additions and 52 deletions (+60, -52).
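Every file on this page follows the same mechanical pattern: tensor-parallel helpers that used to be imported from vllm.distributed are now imported from sglang.srt.distributed, while unrelated vllm imports (CustomOp, rotary embeddings, _custom_ops) stay on vllm for now. A minimal sketch of the before/after, assuming an initialized distributed environment; the helper function below is illustrative and not part of the commit:

# Before (old import, removed by this commit):
#     from vllm.distributed import (
#         divide,
#         get_tensor_model_parallel_rank,
#         get_tensor_model_parallel_world_size,
#     )

# After (same helper names, now provided by sglang itself):
from sglang.srt.distributed import (
    divide,
    get_tensor_model_parallel_rank,
    get_tensor_model_parallel_world_size,
)

def this_ranks_shard(total_size: int) -> tuple[int, int]:
    # Illustrative helper (not from the commit): offset and length of this
    # rank's slice of a tensor-parallel dimension.
    tp_rank = get_tensor_model_parallel_rank()
    tp_size = get_tensor_model_parallel_world_size()
    per_rank = divide(total_size, tp_size)  # integer division, asserts divisibility
    return tp_rank * per_rank, per_rank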
Files changed on this page:

python/sglang/srt/layers/activation.py                  +3  -3
python/sglang/srt/layers/dp_attention.py                +2  -1
python/sglang/srt/layers/linear.py                      +2  -2
python/sglang/srt/layers/logits_processor.py            +2  -2
python/sglang/srt/layers/moe/ep_moe/layer.py            +4  -4
python/sglang/srt/layers/moe/fused_moe_triton/layer.py  +3  -3
python/sglang/srt/layers/parameter.py                   +2  -1
python/sglang/srt/layers/quantization/fp8.py            +1  -1
python/sglang/srt/layers/vocab_parallel_embedding.py    +2  -2
python/sglang/srt/model_executor/cuda_graph_runner.py   +2  -2
python/sglang/srt/model_executor/model_runner.py        +9  -5
python/sglang/srt/model_loader/loader.py                +8  -6
python/sglang/srt/model_loader/weight_utils.py          +1  -1
python/sglang/srt/models/baichuan.py                    +4  -4
python/sglang/srt/models/chatglm.py                     +1  -1
python/sglang/srt/models/commandr.py                    +3  -3
python/sglang/srt/models/dbrx.py                        +4  -4
python/sglang/srt/models/deepseek.py                    +3  -3
python/sglang/srt/models/deepseek_v2.py                 +3  -3
python/sglang/srt/models/exaone.py                      +1  -1
python/sglang/srt/layers/activation.py
@@ -25,13 +25,13 @@ from sglang.srt.utils import is_flashinfer_available
 if is_flashinfer_available():
     from flashinfer.activation import gelu_and_mul, gelu_tanh_and_mul, silu_and_mul
-from vllm.distributed import (
+from vllm.model_executor.custom_op import CustomOp
+from sglang.srt.distributed import (
     divide,
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
 )
-from vllm.model_executor.custom_op import CustomOp
 from sglang.srt.layers.custom_op_util import register_custom_op
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.utils import set_weight_attrs
python/sglang/srt/layers/dp_attention.py
 import torch
-from vllm.distributed import GroupCoordinator, get_tp_group
+from sglang.srt.distributed import GroupCoordinator, get_tp_group
 _ATTN_TP_GROUP = None
 _ATTN_TP_RANK = None
python/sglang/srt/layers/linear.py
@@ -7,7 +7,8 @@ from typing import Dict, List, Optional, Tuple
 import torch
 import torch.nn.functional as F
 from torch.nn.parameter import Parameter, UninitializedParameter
-from vllm.distributed import (
+from sglang.srt.distributed import (
     divide,
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
@@ -15,7 +16,6 @@ from vllm.distributed import (
     tensor_model_parallel_all_gather,
     tensor_model_parallel_all_reduce,
 )
 from sglang.srt.layers.parameter import (
     BasevLLMParameter,
     PackedColumnParameter,
python/sglang/srt/layers/logits_processor.py
@@ -20,11 +20,11 @@ import torch
 import triton
 import triton.language as tl
 from torch import nn
-from vllm.distributed import (
+from sglang.srt.distributed import (
     get_tensor_model_parallel_world_size,
     tensor_model_parallel_all_gather,
 )
 from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
 from sglang.srt.model_executor.forward_batch_info import (
     CaptureHiddenMode,
python/sglang/srt/layers/moe/ep_moe/layer.py
@@ -4,13 +4,13 @@ from typing import Callable, List, Optional, Tuple
 import torch
 from torch.nn import Module
 from vllm import _custom_ops as ops
-from vllm.distributed import (
-    get_tensor_model_parallel_rank,
-    get_tensor_model_parallel_world_size,
-)
 from vllm.model_executor.custom_op import CustomOp
 from vllm.model_executor.layers.quantization.fp8 import Fp8Config, Fp8MoEMethod
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
 from sglang.srt.layers.custom_op_util import register_custom_op
 from sglang.srt.layers.moe.ep_moe.kernels import (
     grouped_gemm_triton,
python/sglang/srt/layers/moe/fused_moe_triton/layer.py
@@ -5,13 +5,13 @@ from enum import Enum
 from typing import Callable, List, Optional, Tuple
 import torch
-from vllm.distributed import (
+from vllm.model_executor.custom_op import CustomOp
+from sglang.srt.distributed import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
     tensor_model_parallel_all_reduce,
 )
-from vllm.model_executor.custom_op import CustomOp
 from sglang.srt.layers.custom_op_util import register_custom_op
 from sglang.srt.layers.moe.fused_moe_native import moe_forward_native
 from sglang.srt.layers.moe.topk import select_experts
python/sglang/srt/layers/parameter.py
@@ -6,7 +6,8 @@ from typing import Callable, Optional, Union
 import torch
 from torch.nn import Parameter
-from vllm.distributed import get_tensor_model_parallel_rank
+from sglang.srt.distributed import get_tensor_model_parallel_rank
 __all__ = [
     "BasevLLMParameter",
python/sglang/srt/layers/quantization/fp8.py
@@ -8,7 +8,6 @@ import torch.nn.functional as F
 from torch.nn import Module
 from torch.nn.parameter import Parameter
 from vllm import _custom_ops as ops
-from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
 from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
     apply_fp8_marlin_linear,
@@ -24,6 +23,7 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     requantize_with_max_scale,
 )
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
 from sglang.srt.layers.linear import (
     LinearBase,
     LinearMethodBase,
python/sglang/srt/layers/vocab_parallel_embedding.py
@@ -6,13 +6,13 @@ from typing import List, Optional, Sequence, Tuple
 import torch
 import torch.nn.functional as F
 from torch.nn.parameter import Parameter, UninitializedParameter
-from vllm.distributed import (
+from sglang.srt.distributed import (
     divide,
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
     tensor_model_parallel_all_reduce,
 )
 from sglang.srt.layers.parameter import BasevLLMParameter
 from sglang.srt.layers.quantization.base_config import (
     QuantizationConfig,
python/sglang/srt/model_executor/cuda_graph_runner.py
@@ -21,10 +21,10 @@ from typing import TYPE_CHECKING, Callable
 import torch
 import tqdm
-from vllm.distributed import get_tensor_model_parallel_rank
-from vllm.distributed.parallel_state import graph_capture
 from vllm.model_executor.custom_op import CustomOp
+from sglang.srt.distributed import get_tensor_model_parallel_rank
+from sglang.srt.distributed.parallel_state import graph_capture
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
 from sglang.srt.layers.moe.fused_moe_native import fused_moe_forward_native
 from sglang.srt.layers.torchao_utils import save_gemlite_cache
python/sglang/srt/model_executor/model_runner.py
@@ -21,16 +21,17 @@ from typing import List, Optional, Tuple
 import torch
 import torch.distributed as dist
-from vllm.distributed import (
+from sglang.srt.configs.device_config import DeviceConfig
+from sglang.srt.configs.load_config import LoadConfig
+from sglang.srt.configs.model_config import AttentionArch, ModelConfig
+from sglang.srt.distributed import (
     get_tp_group,
     init_distributed_environment,
     initialize_model_parallel,
     set_custom_all_reduce,
 )
-from sglang.srt.configs.device_config import DeviceConfig
-from sglang.srt.configs.load_config import LoadConfig
-from sglang.srt.configs.model_config import AttentionArch, ModelConfig
+from sglang.srt.distributed.parallel_state import monkey_patch_vllm_parallel_state
 from sglang.srt.layers.attention.double_sparsity_backend import DoubleSparseAttnBackend
 from sglang.srt.layers.attention.flashinfer_backend import FlashInferAttnBackend
 from sglang.srt.layers.attention.torch_native_backend import TorchNativeAttnBackend
@@ -295,12 +296,15 @@ class ModelRunner:
             monkey_patch_vllm_gguf_config()
         # Load the model
+        # Remove monkey_patch when linear.py quant remove dependencies with vllm
+        monkey_patch_vllm_parallel_state()
         with self.memory_saver_adapter.region():
             self.model = get_model(
                 model_config=self.model_config,
                 load_config=self.load_config,
                 device_config=DeviceConfig(self.device),
             )
+        monkey_patch_vllm_parallel_state(reverse=True)
         if self.server_args.kv_cache_dtype == "fp8_e4m3":
             if self.server_args.quantization_param_path is not None:
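The second model_runner.py hunk keeps one explicit bridge to vllm: model loading is wrapped in monkey_patch_vllm_parallel_state() and undone afterwards with reverse=True, so vllm's quantized-linear code still sees a parallel state while sglang owns the real one. A rough, self-contained sketch of that reversible-patch idiom in general; the module names and implementation below are illustrative stand-ins, not sglang's actual code:

from types import SimpleNamespace

# Hypothetical stand-ins for the two parallel-state modules; names are illustrative only.
vllm_parallel_state = SimpleNamespace(get_tp_group=lambda: "vllm tp group")
sglang_parallel_state = SimpleNamespace(get_tp_group=lambda: "sglang tp group")

_saved = {}

def monkey_patch_parallel_state(reverse: bool = False) -> None:
    # Forward patch: make the vllm-facing accessor return sglang's group.
    # Reverse patch: restore whatever accessor was there before.
    if not reverse:
        _saved["get_tp_group"] = vllm_parallel_state.get_tp_group
        vllm_parallel_state.get_tp_group = sglang_parallel_state.get_tp_group
    else:
        vllm_parallel_state.get_tp_group = _saved.pop("get_tp_group")

# Usage mirroring the hunk above: patch around model loading, then undo.
monkey_patch_parallel_state()
try:
    print(vllm_parallel_state.get_tp_group())  # "sglang tp group" while loading
finally:
    monkey_patch_parallel_state(reverse=True)
print(vllm_parallel_state.get_tp_group())      # "vllm tp group" again afterwards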
python/sglang/srt/model_loader/loader.py
@@ -21,14 +21,14 @@ from huggingface_hub import HfApi, hf_hub_download
 from torch import nn
 from transformers import AutoModelForCausalLM, PretrainedConfig
 from transformers.utils import SAFE_WEIGHTS_INDEX_NAME
-from vllm.distributed import (
-    get_tensor_model_parallel_rank,
-    get_tensor_model_parallel_world_size,
-)
 from sglang.srt.configs.device_config import DeviceConfig
 from sglang.srt.configs.load_config import LoadConfig, LoadFormat
 from sglang.srt.configs.model_config import ModelConfig
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.model_loader.utils import (
     get_model_architecture,
@@ -496,7 +496,8 @@ class ShardedStateLoader(BaseModelLoader):
         device_config: DeviceConfig,
     ) -> nn.Module:
         from safetensors.torch import safe_open
-        from vllm.distributed import get_tensor_model_parallel_rank
+        from sglang.srt.distributed import get_tensor_model_parallel_rank
         local_model_path = self._prepare_weights(
             model_config.model_path, model_config.revision
@@ -556,7 +557,8 @@ class ShardedStateLoader(BaseModelLoader):
         max_size: Optional[int] = None,
     ) -> None:
         from safetensors.torch import save_file
-        from vllm.distributed import get_tensor_model_parallel_rank
+        from sglang.srt.distributed import get_tensor_model_parallel_rank
         if pattern is None:
             pattern = ShardedStateLoader.DEFAULT_PATTERN
python/sglang/srt/model_loader/weight_utils.py
@@ -19,10 +19,10 @@ import torch
 from huggingface_hub import HfFileSystem, hf_hub_download, snapshot_download
 from safetensors.torch import load_file, safe_open, save_file
 from tqdm.auto import tqdm
-from vllm.distributed import get_tensor_model_parallel_rank
 from sglang.srt.configs.load_config import LoadConfig
 from sglang.srt.configs.model_config import ModelConfig
+from sglang.srt.distributed import get_tensor_model_parallel_rank
 from sglang.srt.layers.quantization import QuantizationConfig, get_quantization_config
 from sglang.srt.utils import print_warning_once
python/sglang/srt/models/baichuan.py
@@ -24,10 +24,6 @@ from typing import Iterable, Optional, Tuple
 import torch
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.distributed import (
-    get_tensor_model_parallel_rank,
-    get_tensor_model_parallel_world_size,
-)
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
     QKVParallelLinear,
@@ -35,6 +31,10 @@ from vllm.model_executor.layers.linear import (
 )
 from vllm.model_executor.layers.rotary_embedding import get_rope
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.logits_processor import LogitsProcessor
python/sglang/srt/models/chatglm.py
@@ -21,10 +21,10 @@ from typing import Iterable, Optional, Tuple
 import torch
 from torch import nn
 from torch.nn import LayerNorm
-from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from sglang.srt.configs import ChatGLMConfig
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import (
python/sglang/srt/models/commandr.py
@@ -44,12 +44,12 @@ import torch.utils.checkpoint
 from torch import nn
 from torch.nn.parameter import Parameter
 from transformers import PretrainedConfig
-from vllm.distributed import (
+from vllm.model_executor.layers.rotary_embedding import get_rope
+from sglang.srt.distributed import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
 )
-from vllm.model_executor.layers.rotary_embedding import get_rope
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.linear import (
     MergedColumnParallelLinear,
python/sglang/srt/models/dbrx.py
@@ -19,14 +19,14 @@ from typing import Iterable, Optional, Tuple
 import torch
 import torch.nn as nn
-from vllm.distributed import (
+from vllm.model_executor.layers.rotary_embedding import get_rope
+from sglang.srt.configs import DbrxConfig
+from sglang.srt.distributed import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
     tensor_model_parallel_all_reduce,
 )
-from vllm.model_executor.layers.rotary_embedding import get_rope
-from sglang.srt.configs import DbrxConfig
 from sglang.srt.layers.linear import (
     QKVParallelLinear,
     ReplicatedLinear,
python/sglang/srt/models/deepseek.py
@@ -21,13 +21,13 @@ from typing import Any, Dict, Iterable, Optional, Tuple
 import torch
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.distributed import (
+from vllm.model_executor.layers.rotary_embedding import get_rope
+from sglang.srt.distributed import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
     tensor_model_parallel_all_reduce,
 )
-from vllm.model_executor.layers.rotary_embedding import get_rope
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import (
python/sglang/srt/models/deepseek_v2.py
@@ -23,14 +23,14 @@ import torch.nn.functional as F
 from torch import nn
 from transformers import PretrainedConfig
 from vllm import _custom_ops as ops
-from vllm.distributed import (
+from vllm.model_executor.layers.rotary_embedding import get_rope
+from sglang.srt.distributed import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
     get_tp_group,
     tensor_model_parallel_all_reduce,
 )
-from vllm.model_executor.layers.rotary_embedding import get_rope
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import (
python/sglang/srt/models/exaone.py
@@ -20,9 +20,9 @@ from typing import Any, Dict, Iterable, Optional, Tuple
 import torch
 from torch import nn
-from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.rotary_embedding import get_rope
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import (
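Since only 20 of the 45 changed files fit on this page, a quick local check that no module under python/sglang still imports the distributed helpers from vllm can be done with a scan like the one below (an illustrative script, not part of the commit; run it from the repository root):

import re
from pathlib import Path

# Report any remaining vllm.distributed imports in the sglang source tree.
pattern = re.compile(r"^\s*(?:from|import)\s+vllm\.distributed\b", re.MULTILINE)

for path in sorted(Path("python/sglang").rglob("*.py")):
    text = path.read_text(encoding="utf-8", errors="ignore")
    for match in pattern.finditer(text):
        line_no = text.count("\n", 0, match.start()) + 1
        print(f"{path}:{line_no}: {match.group(0).strip()}")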
(Diff page 1 of 3; the remaining changed files are shown on the following pages.)