Commit 6781a21e authored by zhuwenwen
Browse files

Merge remote-tracking branch 'origin/0.9.1-dev-w8a8' into v0.9.1-dev

parents f9795c8c c0c0eb69
......@@ -231,8 +231,8 @@ class BlockInt8LinearMethod(LinearMethodBase):
n=layer.weight.shape[0]
k=layer.weight.shape[1]
if {n,k} not in self.tritonsingleton.weight_shapes:
self.tritonsingleton.weight_shapes.append({n,k})
if [n,k] not in self.tritonsingleton.weight_shapes:
self.tritonsingleton.weight_shapes.append([n,k])
json_file=self.tritonsingleton.get_blockint8json_name(n,k,self.block_size[0],self.block_size[1])
configs_dict=self.tritonsingleton.get_blockint8_triton_cache(json_file,n,k,self.block_size[0],self.block_size[1])
......@@ -260,7 +260,6 @@ class BlockInt8LinearMethod(LinearMethodBase):
K=x.shape[1]
N=layer.weight.shape[0]
#print("self.tritonsingleton.triton_json_dict:",self.tritonsingleton.triton_json_dict)
#Get the best config options
if len(self.tritonsingleton.triton_json_dict)==0:
config=None
......@@ -292,9 +291,7 @@ class BlockInt8LinearMethod(LinearMethodBase):
else:
config=None
#print("m:{},n:{},k:{},config:{}".format(M,N,K,config))
return apply_w8a8_block_int8_linear(
input=x,
weight=layer.weight,
......@@ -431,6 +428,26 @@ class BlockInt8MoEMethod:
def process_weights_after_loading(self, layer: Module) -> None:
    """Register this layer's MoE weight shape and load its tuned configs.

    Block quantization needs no weight post-processing; this hook is used to
    record the MoE shape on the shared ``tritonsingleton``, merge any cached
    block-int8 MoE Triton configs for that shape, and refresh ``model.json``.

    Args:
        layer: Module exposing ``w13_weight`` and ``w2_weight``; only their
            ``shape`` attributes are read here.
    """
    # Shape components, named as in the cache-file naming helpers.
    E = layer.w13_weight.shape[0]
    N1 = layer.w13_weight.shape[1]
    N2 = layer.w2_weight.shape[1]
    K = layer.w2_weight.shape[2]
    # Read up front so it is always defined for the gen_model_json call below.
    block_size = self.quant_config.weight_block_size

    moe_shape = [E, N1, N2, K]
    # NOTE(review): the original indentation is not recoverable in this view;
    # the cache lookup is assumed to run only for newly seen shapes (repeating
    # it for a known shape would merge the same entries again) -- confirm.
    if moe_shape not in self.tritonsingleton.moe_weight_shapes:
        self.tritonsingleton.moe_weight_shapes.append(moe_shape)
        TOPK = self.tritonsingleton.topk
        json_file = self.tritonsingleton.get_moeblockint8json_name(
            block_size, E, N1, N2, K, TOPK)
        configs_dict = self.tritonsingleton.get_moeblockint8_triton_cache(
            json_file, block_size, E, N1, N2, K, TOPK)
        # Warmup: merge cached configs (None/empty when no cache file exists).
        if configs_dict:
            self.tritonsingleton.triton_moejson_dict.update(configs_dict)

    # Generate the model configuration file. gen_model_json's signature is
    # (E, block_size=None); passing block_size alone bound it to E and left
    # weight_block_size unset in the generated file.
    self.tritonsingleton.gen_model_json(E, block_size)
    return
def apply(
......
......@@ -597,8 +597,8 @@ class CompressedTensorsLinearMethod(LinearMethodBase):
k=layer.weight.shape[1]
if self.w8a8_strategy==1:
if {n,k} not in self.tritonsingleton.weight_shapes:
self.tritonsingleton.weight_shapes.append({n,k})
if [n,k] not in self.tritonsingleton.weight_shapes:
self.tritonsingleton.weight_shapes.append([n,k])
json_file=self.tritonsingleton.get_w8a8json_name(n,k)
configs_dict=self.tritonsingleton.get_triton_cache(json_file,n,k)
......@@ -607,7 +607,7 @@ class CompressedTensorsLinearMethod(LinearMethodBase):
for key, value in configs_dict.items():
m=int(key.split('_')[0])
ops.triton_int8_gemm_helper(m=m,n=n,k=k,per_token_act_quant=True,per_out_channel_weight_quant=True,use_bias=False,best_config=value)
ops.triton_int8_gemm_helper(m=m,n=n,k=k,per_token_act_quant=True,per_out_channel_weight_quant=True,use_bias=False,device=layer.weight.device,best_config=value)
else:
weight_data=layer.weight.data
_weight=weight_data.T.contiguous().reshape(n,-1)
......
......@@ -60,6 +60,7 @@ from .utils import (PPMissingLayer, is_pp_missing_parameter,
make_empty_intermediate_tensors_factory, make_layers,
maybe_prefix)
from vllm import _custom_ops as ops
from vllm.utils import W8a8GetCacheJSON
class DeepseekV2MLP(nn.Module):
......@@ -727,6 +728,9 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP):
self.model.make_empty_intermediate_tensors)
self.use_llama_nn = os.environ.get('LLAMA_NN') == '1'
self.use_awq_pad = os.environ.get('AWQ_PAD') == '1'
self.tritonsingleton= W8a8GetCacheJSON()
self.tritonsingleton.topk = config.num_experts_per_tok
self.tritonsingleton.quant_method=self.quant_method
def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
    """Return the embeddings for ``input_ids`` by delegating to ``self.model``."""
    embeddings = self.model.get_input_embeddings(input_ids)
    return embeddings
......
......@@ -1872,7 +1872,6 @@ class AtomicCounter:
def value(self):
    """Return the counter's current stored value (``self._value``)."""
    current = self._value
    return current
class W8a8GetCacheJSON:
_instance = None
......@@ -1883,14 +1882,69 @@ class W8a8GetCacheJSON:
return cls._instance
def _initialize(self):
    """Set up the config directory, the empty caches and the device name.

    The JSON directory comes from ``TRITON_JSON_DIR`` when set, otherwise
    from ``../lmslim/configs/w8a8`` relative to this file. The device name
    is resolved once here and stored on ``self.device_name``.
    """
    from vllm.platforms import current_platform

    # Directory holding the tuned Triton config JSON files.
    module_dir = os.path.dirname(os.path.abspath(__file__))
    default_json_dir = module_dir + '/../lmslim/configs/w8a8'
    self.triton_json_dir = os.getenv('TRITON_JSON_DIR', default_json_dir)

    # Loaded configs and the weight shapes recorded so far.
    self.triton_json_dict = {}
    self.triton_moejson_dict = {}
    self.triton_json_list = []
    self.weight_shapes = []
    self.moe_weight_shapes = []

    # Device name with spaces replaced, so it can be embedded in file names.
    detected = current_platform.get_device_name().replace(" ", "_")
    if 'K100_AI' in detected:
        props = torch.cuda.get_device_properties(torch.cuda.current_device())
        # A K100_AI reporting 120 multiprocessors gets its own name.
        if props.multi_processor_count == 120:
            detected = 'K100_AI_120'
    self.device_name = detected

    # Defaults; the visible model code overwrites both after construction.
    self.topk = 1
    self.quant_method = None
# Writes the weight shapes recorded so far to model.json (called explicitly
# by the quantization methods; it is an ordinary method, not a destructor).
def gen_model_json(self,E:int,block_size:Optional[list]=None):
    """Dump the recorded linear and MoE weight shapes to ``model.json``.

    The file is written into the directory named by the
    ``LMSLIM_TUNING_JSON`` environment variable. Nothing is written when the
    variable is unset or does not name an existing path; an info message is
    logged instead.

    Args:
        E: Accepted for interface compatibility; not read by this method.
        block_size: Optional weight block size stored under
            ``quantization_config.weight_block_size``. When ``None`` the
            placeholder string ``"None"`` is kept.
    """
    json_dir = os.getenv('LMSLIM_TUNING_JSON', "None")
    # Compare by value: `is not "None"` tested object identity against a
    # literal, which only works by accident of string interning and raises a
    # SyntaxWarning on Python 3.8+.
    if json_dir != "None" and os.path.exists(json_dir):
        # Generate the model configuration file.
        logger.info("model_tuning.json is at LMSLIM_TUNING_JSON:%s", json_dir)
        config = {
            "layers": {
                "linear": {
                    "shapes": [],
                    "m_range": "None",
                },
                "moe": {
                    "shapes": [],
                    "m_range": "None",
                    "topk": self.topk,
                },
            },
            "quantization_config": {
                "quant_method": self.quant_method,
                "weight_block_size": "None",
            },
        }
        # MoE shapes are recorded as [E, N1, N2, K]; entries of any other
        # length are skipped.
        for shape in self.moe_weight_shapes:
            if len(shape) == 4:
                config["layers"]["moe"]["shapes"].append({
                    "E": shape[0],
                    "N1": shape[1],
                    "N2": shape[2],
                    "K": shape[3],
                })
        # Linear shapes are stored as recorded, without reformatting.
        config["layers"]["linear"]["shapes"].extend(self.weight_shapes)
        if block_size is not None:
            config["quantization_config"]["weight_block_size"] = block_size
        with open(os.path.join(json_dir, "model.json"), 'w') as f:
            json.dump(config, f, indent=4)
    else:
        logger.info("LMSLIM_TUNING_JSON is not set")
def getspec_config(self,configs_dict,M,N,K):
if f"{M}_{N}_{K}" in configs_dict:
return configs_dict[f"{M}_{N}_{K}"]
......@@ -1913,24 +1967,11 @@ class W8a8GetCacheJSON:
for key, value in cachedata.items():
for sub_key, sub_value in value.items():
configs_key= f"{sub_key}_{key}"
configs_value={
'SPLIT_K': int(sub_value["SPLIT_K"]),
'BLOCK_SIZE_M': int(sub_value["BLOCK_SIZE_M"]),
'BLOCK_SIZE_N': int(sub_value["BLOCK_SIZE_N"]),
'BLOCK_SIZE_K': int(sub_value["BLOCK_SIZE_K"]),
'GROUP_SIZE_M': int(sub_value["GROUP_SIZE_M"]),
'num_stages':int(sub_value['num_stages']),
'num_warps':int(sub_value['num_warps'])
}
configs_dict[configs_key]=configs_value
configs_dict[configs_key]=sub_value
return configs_dict
def get_w8a8json_name(self,n,k):
    """Return the path of the W8A8 tuning-cache JSON for an ``(n, k)`` weight.

    Uses the device name cached on ``self.device_name`` instead of
    re-detecting the platform on every call, which left the
    ``self.device_name``-based return unreachable.
    """
    return self.triton_json_dir+f"/W8A8_{n}_{k}_{self.device_name}.json"
def get_blockint8_triton_cache(self,file_path,n,k,block_n,block_k):
cache_json_file=file_path
......@@ -1947,27 +1988,34 @@ class W8a8GetCacheJSON:
for key, value in cachedata.items():
for sub_key, sub_value in value.items():
configs_key= f"{sub_key}_{key}"
configs_value={
'BLOCK_SIZE_M': int(sub_value["BLOCK_SIZE_M"]),
'BLOCK_SIZE_N': int(sub_value["BLOCK_SIZE_N"]),
'BLOCK_SIZE_K': int(sub_value["BLOCK_SIZE_K"]),
'GROUP_SIZE_M': int(sub_value["GROUP_SIZE_M"]),
'kpack': int(sub_value["kpack"]),
'num_stages':int(sub_value['num_stages']),
'num_warps':int(sub_value['num_warps']),
'enable_mmacfuse':int(sub_value['enable_mmacfuse']),
}
configs_dict[configs_key]=configs_value
configs_dict[configs_key]=sub_value
return configs_dict
def get_blockint8json_name(self,n,k,block_n,block_k):
    """Return the path of the block-int8 linear tuning-cache JSON.

    The file name encodes the weight shape ``(n, k)``, the quantization block
    ``[block_n,block_k]`` and the device name cached on ``self.device_name``.
    The earlier per-call platform detection is dropped: it duplicated that
    cached value and made the ``self.device_name``-based return unreachable.
    """
    return self.triton_json_dir+f"/linear_{n}_{k}_block[{block_n},{block_k}]_{self.device_name}.json"
def get_moeblockint8json_name(self,block_size,E,N1,N2,K,TOPK):
    """Return the path of the MoE block-int8 tuning-cache JSON.

    The file name encodes the first two entries of ``block_size``, the shape
    values ``E``/``N1``/``N2``/``K``, ``TOPK`` and ``self.device_name``.
    """
    config_dir = self.triton_json_dir
    file_part = f"/MOE_BLOCKINT8[{block_size[0]},{block_size[1]}]_E={E}_N1={N1}_N2={N2}_K={K}_TOPK{TOPK}_{self.device_name}.json"
    return config_dir + file_part
def get_moeblockint8_triton_cache(self,file_path,block_size,E,N1,N2,K,TOPK):
    """Load a MoE block-int8 cache file and flatten it into one dict.

    The JSON file is expected to be a two-level mapping. Each inner entry is
    re-keyed as ``f"{sub_key}_{key}"`` (inner key first, then outer key) and
    mapped to its config value unchanged.

    Args:
        file_path: Path of the cache JSON file.
        block_size, E, N1, N2, K, TOPK: Accepted for interface symmetry with
            the file-name helper; not read here.

    Returns:
        The flattened dict, or ``None`` when ``file_path`` does not exist.
    """
    if not os.path.exists(file_path):
        return None
    with open(file_path, 'r') as cache_file:
        raw_cache = json.load(cache_file)
    # Flatten every cache entry into key -> config form.
    return {
        f"{inner_key}_{outer_key}": tuned_config
        for outer_key, inner_entries in raw_cache.items()
        for inner_key, tuned_config in inner_entries.items()
    }
# Adapted from: https://stackoverflow.com/a/47212782/5082708
class LazyDict(Mapping[str, T], Generic[T]):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment