Commit ec0136e7 authored by zhuwenwen
Browse files

Merge branch 'v0.6.2-dev_wm' into 'v0.6.2-dev'

解决qwen lora模型推理结果异常问题 (Fix abnormal inference results for Qwen LoRA models)

See merge request dcutoolkit/deeplearing/vllm!56
parents 7fd1d015 da39222e
......@@ -284,6 +284,11 @@ def main(args: argparse.Namespace):
topk = config.num_experts_per_tok
intermediate_size = config.intermediate_size
shard_intermediate_size = 2 * intermediate_size // args.tp_size
elif config.architectures[0] == "DeepseekV2ForCausalLM":
E = config.n_routed_experts
topk = config.num_experts_per_tok
intermediate_size = config.moe_intermediate_size
shard_intermediate_size = 2 * intermediate_size // args.tp_size
else:
# Default: Mixtral.
E = config.num_local_experts
......@@ -306,7 +311,7 @@ def main(args: argparse.Namespace):
ray.init(address=None,
ignore_reinit_error=True,
num_gpus=args.tp_size)
num_gpus=1)
num_gpus = int(ray.available_resources()["GPU"])
workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)]
......
......@@ -298,7 +298,7 @@ __device__ void paged_attention_with_mask_kernel(
qk += (alibi_slope != 0) ? alibi_slope * (token_idx - seq_len + 1) : 0;
// used for tree-style attention
if (attn_masks != nullptr) {
if (attn_masks != nullptr && token_idx < seq_len) {
const int* attn_masks_ptr = attn_masks + seq_idx * attn_masks_stride;
if (attn_masks_ptr[token_idx] == 0) {
qk = -FLT_MAX;
......
......@@ -329,7 +329,7 @@ __device__ void paged_attention_with_mask_kernel_opt(
qk += (alibi_slope != 0) ? alibi_slope * (token_idx - seq_len + 1) : 0;
// used for tree-style attention
if (attn_masks != nullptr) {
if (attn_masks != nullptr && token_idx < seq_len) {
const int* attn_masks_ptr = attn_masks + seq_idx * attn_masks_stride;
if (attn_masks_ptr[token_idx] == 0) {
qk = -FLT_MAX;
......
......@@ -294,7 +294,7 @@ __device__ void paged_attention_with_mask_kernel_TC(
}
// used for tree-style attention
if (attn_masks != nullptr) {
if (attn_masks != nullptr && token_idx < seq_len) {
const int* attn_masks_ptr = attn_masks + seq_idx * attn_masks_stride;
if (attn_masks_ptr[token_idx] == 0) {
qk_vec[i] = -FLT_MAX;
......
......@@ -42,9 +42,6 @@ def sample_requests(
# Only keep the first two turns of each conversation.
dataset = [data["prompt"] for data in dataset]
# Shuffle the dataset.
random.shuffle(dataset)
# Filter out sequences that are too long or too short
filtered_dataset: List[Tuple[str, int, int]] = []
for i in range(len(dataset)):
......
......@@ -363,18 +363,9 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA):
if config.tie_word_embeddings:
self.lm_head = self.model.embed_tokens
else:
# self.lm_head = ParallelLMHead(config.vocab_size,
# config.hidden_size,
# quant_config=quant_config)
self.unpadded_vocab_size = config.vocab_size
if lora_config:
self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
self.lm_head = ParallelLMHead(
self.unpadded_vocab_size,
self.lm_head = ParallelLMHead(config.vocab_size,
config.hidden_size,
org_num_embeddings=config.vocab_size,
quant_config=quant_config,
)
quant_config=quant_config)
self.logits_processor = LogitsProcessor(config.vocab_size)
self.sampler = Sampler()
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment