sglang · Commits · be0124bd

Commit be0124bd (Unverified)
Authored Nov 24, 2024 by Lianmin Zheng; committed by GitHub on Nov 24, 2024

Rename triton_fused_moe -> fused_moe_triton (#2163)
Parent: fe5d3e81
Changes: 76 · Showing 16 changed files with 12 additions and 12 deletions (+12 / -12)
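For downstream code, the practical effect of this commit is an import-path change only; the exported symbols keep their names. A minimal sketch, using the paths and symbols shown in the diffs below:

# Old path (before this commit):
#     from sglang.srt.layers.triton_fused_moe import FusedMoE, fused_moe
# New path (after this commit):
from sglang.srt.layers.fused_moe_triton import FusedMoE, fused_moe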
Changed files (this page):

python/sglang/srt/layers/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json  +0 -0
python/sglang/srt/layers/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json  +0 -0
python/sglang/srt/layers/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json  +0 -0
python/sglang/srt/layers/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json  +0 -0
python/sglang/srt/layers/fused_moe_triton/configs/README  +0 -0
python/sglang/srt/layers/fused_moe_triton/fused_moe.py  +1 -1
python/sglang/srt/layers/fused_moe_triton/layer.py  +2 -2
python/sglang/srt/layers/quantization/__init__.py  +1 -1
python/sglang/srt/models/dbrx.py  +1 -1
python/sglang/srt/models/deepseek.py  +1 -1
python/sglang/srt/models/deepseek_v2.py  +1 -1
python/sglang/srt/models/grok.py  +1 -1
python/sglang/srt/models/mixtral.py  +1 -1
python/sglang/srt/models/olmoe.py  +1 -1
python/sglang/srt/models/qwen2_moe.py  +1 -1
python/sglang/srt/models/xverse_moe.py  +1 -1
python/sglang/srt/layers/triton_fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json
→ python/sglang/srt/layers/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json
File moved
python/sglang/srt/layers/triton_fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
→ python/sglang/srt/layers/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
File moved
python/sglang/srt/layers/triton_fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json
→ python/sglang/srt/layers/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json
File moved
python/sglang/srt/layers/triton_fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
→ python/sglang/srt/layers/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
File moved
python/sglang/srt/layers/triton_fused_moe/configs/README
→ python/sglang/srt/layers/fused_moe_triton/configs/README
File moved
python/sglang/srt/layers/triton_fused_moe/fused_moe.py
→ python/sglang/srt/layers/fused_moe_triton/fused_moe.py

@@ -376,7 +376,7 @@ def try_get_optimal_moe_config(
     M: int,
     is_marlin: bool = False,
 ):
-    from sglang.srt.layers.triton_fused_moe import get_config
+    from sglang.srt.layers.fused_moe_triton import get_config

     override_config = get_config()
     if override_config:
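The change above is a path-only edit to a function-local import. For context, the get_config() pattern it touches — a module-level getter returning an optional user-supplied kernel config that overrides autotuned defaults — can be sketched as follows. This is a simplified illustration under assumed names, not the actual sglang implementation:

from contextlib import contextmanager
from typing import Any, Dict, Optional

# Module-level slot for an override config (hypothetical shape).
_config: Optional[Dict[str, Any]] = None

def get_config() -> Optional[Dict[str, Any]]:
    # Callers fall back to autotuned defaults when this returns None.
    return _config

@contextmanager
def override_config(config: Dict[str, Any]):
    # Temporarily install an override for the duration of a with-block.
    global _config
    saved = _config
    _config = config
    try:
        yield
    finally:
        _config = saved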
python/sglang/srt/layers/triton_fused_moe/layer.py
→ python/sglang/srt/layers/fused_moe_triton/layer.py

@@ -20,7 +20,7 @@ from sglang.srt.layers.quantization.base_config import (
 from sglang.srt.utils import set_weight_attrs

 if torch.cuda.is_available() or torch.hip.is_available():
-    from sglang.srt.layers.triton_fused_moe.fused_moe import fused_experts
+    from sglang.srt.layers.fused_moe_triton.fused_moe import fused_experts
 else:
     fused_experts = None  # type: ignore

@@ -514,7 +514,7 @@ class FusedMoE(torch.nn.Module):
         num_expert_group: Optional[int] = None,
         custom_routing_function: Optional[Callable] = None,
     ):
-        from sglang.srt.layers.triton_fused_moe.fused_moe import (
+        from sglang.srt.layers.fused_moe_triton.fused_moe import (
             fused_topk,
             grouped_topk,
         )
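Both hunks above are path-only edits. The first sits inside a guarded import, a pattern that keeps the module importable on hosts without a supported GPU backend; schematically (simplified from the hunk, which also probes a HIP backend):

import torch

if torch.cuda.is_available():
    # With a GPU backend present, pull in the Triton fused-experts kernel.
    from sglang.srt.layers.fused_moe_triton.fused_moe import fused_experts
else:
    # CPU-only hosts can still import the module; callers must check
    # for None before dispatching to the fused kernel.
    fused_experts = None  # type: ignore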
python/sglang/srt/layers/quantization/__init__.py

@@ -68,7 +68,7 @@ def fp8_get_quant_method(self, layer, prefix):
         is_layer_skipped,
     )
-    from sglang.srt.layers.triton_fused_moe.layer import FusedMoE
+    from sglang.srt.layers.fused_moe_triton.layer import FusedMoE

     if isinstance(layer, LinearBase):
         if is_layer_skipped(prefix, self.ignored_layers):
python/sglang/srt/models/dbrx.py

@@ -28,6 +28,7 @@ from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.transformers_utils.configs.dbrx import DbrxConfig

+from sglang.srt.layers.fused_moe_triton import fused_moe
 from sglang.srt.layers.linear import (
     QKVParallelLinear,
     ReplicatedLinear,

@@ -36,7 +37,6 @@ from sglang.srt.layers.linear import (
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.layers.triton_fused_moe import fused_moe
 from sglang.srt.layers.vocab_parallel_embedding import (
     DEFAULT_VOCAB_PADDING_SIZE,
     ParallelLMHead,
python/sglang/srt/models/deepseek.py

@@ -30,6 +30,7 @@ from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader

 from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.fused_moe_triton import fused_moe
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import (
     MergedColumnParallelLinear,

@@ -40,7 +41,6 @@ from sglang.srt.layers.linear import (
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.layers.triton_fused_moe import fused_moe
 from sglang.srt.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
python/sglang/srt/models/deepseek_v2.py

@@ -31,6 +31,7 @@ from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader

 from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.fused_moe_triton import FusedMoE
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import (
     ColumnParallelLinear,

@@ -41,7 +42,6 @@ from sglang.srt.layers.linear import (
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.layers.triton_fused_moe import FusedMoE
 from sglang.srt.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
python/sglang/srt/models/grok.py

@@ -31,7 +31,7 @@ from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.model_loader.loader import DefaultModelLoader
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader

-from sglang.srt.layers.fused_moe import FusedMoE
+from sglang.srt.layers.fused_moe_grok import FusedMoE
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import (
     QKVParallelLinear,
python/sglang/srt/models/mixtral.py

@@ -25,6 +25,7 @@ from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader

+from sglang.srt.layers.fused_moe_triton import FusedMoE
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import (
     QKVParallelLinear,

@@ -35,7 +36,6 @@ from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.layers.torchao_utils import apply_torchao_config_
-from sglang.srt.layers.triton_fused_moe import FusedMoE
 from sglang.srt.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
python/sglang/srt/models/olmoe.py

@@ -38,11 +38,11 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.utils import print_warning_once

 from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.fused_moe_triton import FusedMoE
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.layers.triton_fused_moe import FusedMoE
 from sglang.srt.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
python/sglang/srt/models/qwen2_moe.py

@@ -30,6 +30,7 @@ from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader

 from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.fused_moe_triton import FusedMoE
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import (
     MergedColumnParallelLinear,

@@ -41,7 +42,6 @@ from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.layers.torchao_utils import apply_torchao_config_
-from sglang.srt.layers.triton_fused_moe import FusedMoE
 from sglang.srt.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
python/sglang/srt/models/xverse_moe.py

@@ -34,10 +34,10 @@ from vllm.model_executor.layers.linear import (
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader

+from sglang.srt.layers.fused_moe_triton import fused_moe
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.layers.triton_fused_moe import fused_moe
 from sglang.srt.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
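A rename like this touches many call sites, so a repo-wide check for stale references is a sensible follow-up. A hypothetical helper script (not part of this commit; the path and pattern are assumptions):

import pathlib
import re
import sys

# Matches the pre-rename module name anywhere in a source line.
STALE = re.compile(r"\btriton_fused_moe\b")

def find_stale_imports(root: str) -> int:
    hits = 0
    for path in sorted(pathlib.Path(root).rglob("*.py")):
        for lineno, line in enumerate(path.read_text(encoding="utf-8").splitlines(), 1):
            if STALE.search(line):
                print(f"{path}:{lineno}: {line.strip()}")
                hits += 1
    return hits

if __name__ == "__main__":
    # Exit non-zero if any stale reference survived the rename.
    sys.exit(1 if find_stale_imports("python/sglang") else 0)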