Commit c5fa1992 authored by liuchy5
Browse files

支持fp8 mqa&&跳过VLLM_USE_FUSED_FILL_RMS_CAT&&跳过load_error

parent 3824b261
......@@ -112,6 +112,29 @@ def sparse_attn_indexer(
chunk.cu_seqlen_ks,
chunk.cu_seqlen_ke,
)
elif torch.cuda.get_device_properties("cuda").gcnArchName.split(':')[0] == "gfx938":
k_fp8 = k_fp8_full[: chunk.total_seq_lens]
k_scale = k_scale_full[: chunk.total_seq_lens]
ops.cp_gather_indexer_k_quant_cache(
kv_cache,
k_fp8,
k_scale,
chunk.block_table,
chunk.cu_seq_lens,
)
logits = op.mqa_logits(
q_fp8[chunk.token_start:chunk.token_end],
k_fp8,
weights[chunk.token_start:chunk.token_end],
chunk.cu_seqlen_ks,
chunk.cu_seqlen_ke,
q_fp8[chunk.token_start:chunk.token_end].shape[0],
k_fp8.shape[0],
q_fp8.shape[1],
q_fp8.shape[2],
k_scale.view(torch.float32).flatten(),
True
)
else:
logits = op.mqa_logits(
q_fp8[chunk.token_start:chunk.token_end],
......
......@@ -298,6 +298,7 @@ class DefaultModelLoader(BaseModelLoader):
if model_config.quantization is None and loaded_weights is not None:
weights_not_loaded = weights_to_load - loaded_weights
weights_not_loaded = {k for k in weights_not_loaded if not k.endswith("indexer.weights_proj.bias")}
weights_not_loaded = {k for k in weights_not_loaded if k not in ['model.layers.78.shared_head.head.weight', 'model.embed_tokens.weight']}
if weights_not_loaded:
raise ValueError(
"Following weights were not initialized from "
......
......@@ -112,7 +112,7 @@ class DeepSeekMultiTokenPredictorLayer(nn.Module):
) -> torch.Tensor:
assert inputs_embeds is not None
# masking inputs at position 0, as not needed by MTP
if envs.VLLM_USE_FUSED_FILL_RMS_CAT:
if False:
hidden_states_fuse = torch.empty(inputs_embeds.shape[0], inputs_embeds.shape[1]*2, device=inputs_embeds.device, dtype=inputs_embeds.dtype)
torch.ops.vllm.fuse_fill_rms_x2_concat(hidden_states_fuse, positions, inputs_embeds, previous_hidden_states, self.enorm.weight, self.hnorm.weight, self.enorm.variance_epsilon)
hidden_states = self.eh_proj(hidden_states_fuse)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment