Commit 5ca1259e authored by zhuwenwen's avatar zhuwenwen
Browse files

update moe configs name

parent db2c32b0
{
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2
},
"4": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2
},
"8": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2
},
"16": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2
},
"24": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2
},
"32": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2
},
"48": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2
},
"64": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2
},
"96": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2
},
"128": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2
},
"256": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 2
},
"512": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2
},
"1024": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2
},
"1536": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2
},
"2048": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 2
},
"3072": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 16,
"num_warps": 8,
"num_stages": 2
},
"4096": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 16,
"num_warps": 8,
"num_stages": 2
}
}
...@@ -50,7 +50,9 @@ logger = init_logger(__name__) ...@@ -50,7 +50,9 @@ logger = init_logger(__name__)
if envs.VLLM_USE_GLOBAL_CACHE13: if envs.VLLM_USE_GLOBAL_CACHE13:
moe_cache_singleton = None moe_cache_singleton = None
arch_name = torch.cuda.get_device_properties("cuda").gcnArchName.split(':')[0]
arch_cu = torch.cuda.get_device_properties(torch.cuda.current_device()).multi_processor_count
@torch.compile @torch.compile
def moe_sum_reduce_torch_compile(x, out, routed_scaling_factor): def moe_sum_reduce_torch_compile(x, out, routed_scaling_factor):
torch.sum(x, dim=1, out=out) torch.sum(x, dim=1, out=out)
...@@ -997,14 +999,14 @@ def get_config_file_name(E: int, ...@@ -997,14 +999,14 @@ def get_config_file_name(E: int,
N: int, N: int,
dtype: Optional[str], dtype: Optional[str],
block_shape: Optional[List[int]] = None, use_nn_moe: Optional[bool] = False) -> str: block_shape: Optional[List[int]] = None, use_nn_moe: Optional[bool] = False) -> str:
device_name = current_platform.get_device_name().replace(" ", "_") # device_name = current_platform.get_device_name().replace(" ", "_")
dtype_selector = "" if not dtype else f",dtype={dtype}" dtype_selector = "" if not dtype else f",dtype={dtype}"
block_shape_selector = ("" if not block_shape or not all(block_shape) else block_shape_selector = ("" if not block_shape or not all(block_shape) else
f",block_shape={block_shape}").replace(" ", "") f",block_shape={block_shape}").replace(" ", "")
if not use_nn_moe: if not use_nn_moe:
return f"E={E},N={N},device_name={device_name}{dtype_selector}{block_shape_selector}.json" # noqa: E501 return f"E={E},N={N},device_name={arch_name}_{arch_cu}cu{dtype_selector}{block_shape_selector}.json"
else: else:
return f"E={E},N={N},device_name={device_name}{dtype_selector}{block_shape_selector}_nn.json" return f"E={E},N={N},device_name={arch_name}_{arch_cu}cu{dtype_selector}{block_shape_selector}_nn.json"
# Adapted from: https://github.com/sgl-project/sglang/pull/2628 # Adapted from: https://github.com/sgl-project/sglang/pull/2628
@functools.lru_cache @functools.lru_cache
......
...@@ -84,9 +84,9 @@ DEFAULT_MAX_NUM_BATCHED_TOKENS = 2048 ...@@ -84,9 +84,9 @@ DEFAULT_MAX_NUM_BATCHED_TOKENS = 2048
POOLING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768 POOLING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768
MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS = 5120 MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS = 5120
gpuname = torch.cuda.get_device_properties(torch.cuda.current_device()).name GPU_ARCH = torch.cuda.get_device_properties("cuda").gcnArchName
is_kme = gpuname.startswith('K100_AI') or gpuname.startswith('K500SM_AI') is_kme = any(arch in GPU_ARCH for arch in ["gfx928"])
SUPPORT_TC = gpuname.startswith('K100_AI') or gpuname.startswith('K500SM_AI') or gpuname.startswith('BW') SUPPORT_TC = any(arch in GPU_ARCH for arch in ["gfx928", "gfx936"])
def _generate_random_int8( def _generate_random_int8(
tensor: torch.Tensor, tensor: torch.Tensor,
...@@ -1956,9 +1956,10 @@ class W8a8GetCacheJSON: ...@@ -1956,9 +1956,10 @@ class W8a8GetCacheJSON:
self.triton_json_list=[] self.triton_json_list=[]
self.weight_shapes=[] self.weight_shapes=[]
self.moe_weight_shapes=[] self.moe_weight_shapes=[]
device_name = current_platform.get_device_name().replace(" ", "_") arch_name = torch.cuda.get_device_properties("cuda").gcnArchName.split(':')[0]
if 'K100_AI' in device_name and torch.cuda.get_device_properties(torch.cuda.current_device()).multi_processor_count == 120: arch_cu = torch.cuda.get_device_properties(torch.cuda.current_device()).multi_processor_count
device_name='K100_AI_120'
device_name =arch_name+'_'+str(arch_cu)+'cu'
self.device_name=device_name self.device_name=device_name
self.topk=1 self.topk=1
self.quant_method=None self.quant_method=None
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment