Commit c0c0eb69 authored by gaoqiong's avatar gaoqiong
Browse files

修复channelwise w8a8 预热没有区分卡造成的triton kernel崩溃问题,另外增加block-int8的moe 拿到config步骤

parent 9d5187eb
...@@ -231,8 +231,8 @@ class BlockInt8LinearMethod(LinearMethodBase): ...@@ -231,8 +231,8 @@ class BlockInt8LinearMethod(LinearMethodBase):
n=layer.weight.shape[0] n=layer.weight.shape[0]
k=layer.weight.shape[1] k=layer.weight.shape[1]
if {n,k} not in self.tritonsingleton.weight_shapes: if [n,k] not in self.tritonsingleton.weight_shapes:
self.tritonsingleton.weight_shapes.append({n,k}) self.tritonsingleton.weight_shapes.append([n,k])
json_file=self.tritonsingleton.get_blockint8json_name(n,k,self.block_size[0],self.block_size[1]) json_file=self.tritonsingleton.get_blockint8json_name(n,k,self.block_size[0],self.block_size[1])
configs_dict=self.tritonsingleton.get_blockint8_triton_cache(json_file,n,k,self.block_size[0],self.block_size[1]) configs_dict=self.tritonsingleton.get_blockint8_triton_cache(json_file,n,k,self.block_size[0],self.block_size[1])
...@@ -260,7 +260,6 @@ class BlockInt8LinearMethod(LinearMethodBase): ...@@ -260,7 +260,6 @@ class BlockInt8LinearMethod(LinearMethodBase):
K=x.shape[1] K=x.shape[1]
N=layer.weight.shape[0] N=layer.weight.shape[0]
#print("self.tritonsingleton.triton_json_dict:",self.tritonsingleton.triton_json_dict)
#Get the best config options #Get the best config options
if len(self.tritonsingleton.triton_json_dict)==0: if len(self.tritonsingleton.triton_json_dict)==0:
config=None config=None
...@@ -293,8 +292,6 @@ class BlockInt8LinearMethod(LinearMethodBase): ...@@ -293,8 +292,6 @@ class BlockInt8LinearMethod(LinearMethodBase):
else: else:
config=None config=None
#print("m:{},n:{},k:{},config:{}".format(M,N,K,config))
return apply_w8a8_block_int8_linear( return apply_w8a8_block_int8_linear(
input=x, input=x,
weight=layer.weight, weight=layer.weight,
...@@ -431,6 +428,26 @@ class BlockInt8MoEMethod: ...@@ -431,6 +428,26 @@ class BlockInt8MoEMethod:
def process_weights_after_loading(self, layer: Module) -> None: def process_weights_after_loading(self, layer: Module) -> None:
# Block quant doesn't need to process weights after loading # Block quant doesn't need to process weights after loading
# warmup and get moe block-int8 config # warmup and get moe block-int8 config
E=layer.w13_weight.shape[0]
N1=layer.w13_weight.shape[1]
N2=layer.w2_weight.shape[1]
K=layer.w2_weight.shape[2]
if [E,N1,N2,K] not in self.tritonsingleton.moe_weight_shapes:
self.tritonsingleton.moe_weight_shapes.append([E,N1,N2,K])
TOPK= self.tritonsingleton.topk
block_size=self.quant_config.weight_block_size
json_file=self.tritonsingleton.get_moeblockint8json_name(block_size,E,N1,N2,K,TOPK)
configs_dict=self.tritonsingleton.get_moeblockint8_triton_cache(json_file,block_size,E,N1,N2,K,TOPK)
#warmup
if configs_dict:
self.tritonsingleton.triton_moejson_dict.update(configs_dict)
#生成模型配置文件
self.tritonsingleton.gen_model_json(block_size)
return return
def apply( def apply(
......
...@@ -597,8 +597,8 @@ class CompressedTensorsLinearMethod(LinearMethodBase): ...@@ -597,8 +597,8 @@ class CompressedTensorsLinearMethod(LinearMethodBase):
k=layer.weight.shape[1] k=layer.weight.shape[1]
if self.w8a8_strategy==1: if self.w8a8_strategy==1:
if {n,k} not in self.tritonsingleton.weight_shapes: if [n,k] not in self.tritonsingleton.weight_shapes:
self.tritonsingleton.weight_shapes.append({n,k}) self.tritonsingleton.weight_shapes.append([n,k])
json_file=self.tritonsingleton.get_w8a8json_name(n,k) json_file=self.tritonsingleton.get_w8a8json_name(n,k)
configs_dict=self.tritonsingleton.get_triton_cache(json_file,n,k) configs_dict=self.tritonsingleton.get_triton_cache(json_file,n,k)
...@@ -607,7 +607,7 @@ class CompressedTensorsLinearMethod(LinearMethodBase): ...@@ -607,7 +607,7 @@ class CompressedTensorsLinearMethod(LinearMethodBase):
for key, value in configs_dict.items(): for key, value in configs_dict.items():
m=int(key.split('_')[0]) m=int(key.split('_')[0])
ops.triton_int8_gemm_helper(m=m,n=n,k=k,per_token_act_quant=True,per_out_channel_weight_quant=True,use_bias=False,best_config=value) ops.triton_int8_gemm_helper(m=m,n=n,k=k,per_token_act_quant=True,per_out_channel_weight_quant=True,use_bias=False,device=layer.weight.device,best_config=value)
else: else:
weight_data=layer.weight.data weight_data=layer.weight.data
_weight=weight_data.T.contiguous().reshape(n,-1) _weight=weight_data.T.contiguous().reshape(n,-1)
......
...@@ -60,6 +60,7 @@ from .utils import (PPMissingLayer, is_pp_missing_parameter, ...@@ -60,6 +60,7 @@ from .utils import (PPMissingLayer, is_pp_missing_parameter,
make_empty_intermediate_tensors_factory, make_layers, make_empty_intermediate_tensors_factory, make_layers,
maybe_prefix) maybe_prefix)
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.utils import W8a8GetCacheJSON
class DeepseekV2MLP(nn.Module): class DeepseekV2MLP(nn.Module):
...@@ -727,6 +728,9 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP): ...@@ -727,6 +728,9 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP):
self.model.make_empty_intermediate_tensors) self.model.make_empty_intermediate_tensors)
self.use_llama_nn = os.environ.get('LLAMA_NN') == '1' self.use_llama_nn = os.environ.get('LLAMA_NN') == '1'
self.use_awq_pad = os.environ.get('AWQ_PAD') == '1' self.use_awq_pad = os.environ.get('AWQ_PAD') == '1'
self.tritonsingleton= W8a8GetCacheJSON()
self.tritonsingleton.topk = config.num_experts_per_tok
self.tritonsingleton.quant_method=self.quant_method
def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
return self.model.get_input_embeddings(input_ids) return self.model.get_input_embeddings(input_ids)
......
...@@ -1872,7 +1872,6 @@ class AtomicCounter: ...@@ -1872,7 +1872,6 @@ class AtomicCounter:
def value(self): def value(self):
return self._value return self._value
class W8a8GetCacheJSON: class W8a8GetCacheJSON:
_instance = None _instance = None
...@@ -1883,13 +1882,68 @@ class W8a8GetCacheJSON: ...@@ -1883,13 +1882,68 @@ class W8a8GetCacheJSON:
return cls._instance return cls._instance
def _initialize(self): def _initialize(self):
from vllm.platforms import current_platform
current_folder_path = os.path.dirname(os.path.abspath(__file__)) current_folder_path = os.path.dirname(os.path.abspath(__file__))
json_folder_path=current_folder_path+'/../lmslim/configs/w8a8' json_folder_path=current_folder_path+'/../lmslim/configs/w8a8'
self.triton_json_dir=(os.getenv('TRITON_JSON_DIR', json_folder_path)) self.triton_json_dir=(os.getenv('TRITON_JSON_DIR', json_folder_path))
self.triton_json_dict={} self.triton_json_dict={}
self.triton_moejson_dict={}
self.triton_json_list=[] self.triton_json_list=[]
self.weight_shapes=[] self.weight_shapes=[]
self.moe_weight_shapes=[]
device_name = current_platform.get_device_name().replace(" ", "_")
if 'K100_AI' in device_name and torch.cuda.get_device_properties(torch.cuda.current_device()).multi_processor_count == 120:
device_name='K100_AI_120'
self.device_name=device_name
self.topk=1
self.quant_method=None
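# Not a destructor: this method is called explicitly (after MoE weights are processed) to write the model.json tuning-config summary.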
def gen_model_json(self,E:int,block_size:Optional[list]=None):
    """Write ``model.json`` describing the linear/MoE shapes seen so far.

    The file is written into the directory named by the
    ``LMSLIM_TUNING_JSON`` environment variable. Nothing is written when the
    variable is unset or the directory does not exist.

    Args:
        E: Not used to build the output. Kept for interface compatibility.
            Some call sites pass the weight block size here as the only
            positional argument; when ``E`` is a list/tuple and
            ``block_size`` is omitted, it is treated as the block size.
        block_size: Weight block size recorded under
            ``quantization_config.weight_block_size``. When ``None`` the
            string ``"None"`` is recorded instead.
    """
    # Backward compatibility: a sole positional list/tuple is the block size,
    # otherwise the block size would be dropped silently.
    if block_size is None and isinstance(E, (list, tuple)):
        block_size = E

    # Use a real ``None`` sentinel; comparing strings with ``is`` is
    # implementation-dependent and emits a SyntaxWarning.
    json_dir = os.getenv('LMSLIM_TUNING_JSON')
    if json_dir is None or not os.path.exists(json_dir):
        logger.info("LMSLIM_TUNING_JSON is not set")
        return

    logger.info("model_tuning.json is at LMSLIM_TUNING_JSON:%s", json_dir)

    # Each recorded MoE shape is [E, N1, N2, K]; other lengths are skipped.
    moe_shapes = [
        {
            "E": shape[0],
            "N1": shape[1],
            "N2": shape[2],
            "K": shape[3],
        }
        for shape in self.moe_weight_shapes
        if len(shape) == 4
    ]

    config = {
        "layers": {
            "linear": {
                "shapes": list(self.weight_shapes),
                "m_range": "None",
            },
            "moe": {
                "shapes": moe_shapes,
                "m_range": "None",
                "topk": self.topk,
            },
        },
        "quantization_config": {
            "quant_method": self.quant_method,
            "weight_block_size": "None" if block_size is None else block_size,
        },
    }

    with open(os.path.join(json_dir, "model.json"), 'w') as f:
        json.dump(config, f, indent=4)
def getspec_config(self,configs_dict,M,N,K): def getspec_config(self,configs_dict,M,N,K):
if f"{M}_{N}_{K}" in configs_dict: if f"{M}_{N}_{K}" in configs_dict:
...@@ -1913,24 +1967,11 @@ class W8a8GetCacheJSON: ...@@ -1913,24 +1967,11 @@ class W8a8GetCacheJSON:
for key, value in cachedata.items(): for key, value in cachedata.items():
for sub_key, sub_value in value.items(): for sub_key, sub_value in value.items():
configs_key= f"{sub_key}_{key}" configs_key= f"{sub_key}_{key}"
configs_value={ configs_dict[configs_key]=sub_value
'SPLIT_K': int(sub_value["SPLIT_K"]),
'BLOCK_SIZE_M': int(sub_value["BLOCK_SIZE_M"]),
'BLOCK_SIZE_N': int(sub_value["BLOCK_SIZE_N"]),
'BLOCK_SIZE_K': int(sub_value["BLOCK_SIZE_K"]),
'GROUP_SIZE_M': int(sub_value["GROUP_SIZE_M"]),
'num_stages':int(sub_value['num_stages']),
'num_warps':int(sub_value['num_warps'])
}
configs_dict[configs_key]=configs_value
return configs_dict return configs_dict
def get_w8a8json_name(self,n,k): def get_w8a8json_name(self,n,k):
from vllm.platforms import current_platform return self.triton_json_dir+f"/W8A8_{n}_{k}_{self.device_name}.json"
device_name = current_platform.get_device_name().replace(" ", "_")
if 'K100_AI' in device_name and torch.cuda.get_device_properties(torch.cuda.current_device()).multi_processor_count == 120:
device_name='K100_AI_120'
return self.triton_json_dir+f"/W8A8_{n}_{k}_{device_name}.json"
def get_blockint8_triton_cache(self,file_path,n,k,block_n,block_k): def get_blockint8_triton_cache(self,file_path,n,k,block_n,block_k):
cache_json_file=file_path cache_json_file=file_path
...@@ -1947,26 +1988,33 @@ class W8a8GetCacheJSON: ...@@ -1947,26 +1988,33 @@ class W8a8GetCacheJSON:
for key, value in cachedata.items(): for key, value in cachedata.items():
for sub_key, sub_value in value.items(): for sub_key, sub_value in value.items():
configs_key= f"{sub_key}_{key}" configs_key= f"{sub_key}_{key}"
configs_value={ configs_dict[configs_key]=sub_value
'BLOCK_SIZE_M': int(sub_value["BLOCK_SIZE_M"]),
'BLOCK_SIZE_N': int(sub_value["BLOCK_SIZE_N"]),
'BLOCK_SIZE_K': int(sub_value["BLOCK_SIZE_K"]),
'GROUP_SIZE_M': int(sub_value["GROUP_SIZE_M"]),
'kpack': int(sub_value["kpack"]),
'num_stages':int(sub_value['num_stages']),
'num_warps':int(sub_value['num_warps']),
'enable_mmacfuse':int(sub_value['enable_mmacfuse']),
}
configs_dict[configs_key]=configs_value
return configs_dict return configs_dict
def get_blockint8json_name(self,n,k,block_n,block_k): def get_blockint8json_name(self,n,k,block_n,block_k):
from vllm.platforms import current_platform return self.triton_json_dir+f"/linear_{n}_{k}_block[{block_n},{block_k}]_{self.device_name}.json"
device_name = current_platform.get_device_name().replace(" ", "_")
if 'K100_AI' in device_name and torch.cuda.get_device_properties(torch.cuda.current_device()).multi_processor_count == 120: def get_moeblockint8json_name(self,block_size,E,N1,N2,K,TOPK):
device_name='K100_AI_120' return self.triton_json_dir+f"/MOE_BLOCKINT8[{block_size[0]},{block_size[1]}]_E={E}_N1={N1}_N2={N2}_K={K}_TOPK{TOPK}_{self.device_name}.json"
return self.triton_json_dir+f"/linear_{n}_{k}_block[{block_n},{block_k}]_{device_name}.json"
def get_moeblockint8_triton_cache(self,file_path,block_size,E,N1,N2,K,TOPK):
    """Load a MoE block-int8 tuning cache and flatten it into one dict.

    The JSON file is expected to be a two-level mapping
    ``{outer_key: {inner_key: config}}``. The result maps
    ``f"{inner_key}_{outer_key}"`` to ``config`` unchanged.

    Only ``file_path`` is read; the remaining parameters are accepted but
    not used here.

    Returns:
        The flattened dict, or ``None`` when ``file_path`` does not exist.
    """
    # No cache file for this shape/device: signal "nothing to load".
    if not os.path.exists(file_path):
        return None

    with open(file_path, 'r') as handle:
        raw_cache = json.load(handle)

    # Flatten the nested mapping, keeping the file's iteration order.
    return {
        f"{inner_key}_{outer_key}": inner_cfg
        for outer_key, per_outer in raw_cache.items()
        for inner_key, inner_cfg in per_outer.items()
    }
# Adapted from: https://stackoverflow.com/a/47212782/5082708 # Adapted from: https://stackoverflow.com/a/47212782/5082708
class LazyDict(Mapping[str, T], Generic[T]): class LazyDict(Mapping[str, T], Generic[T]):
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment