[Bugfix] Fix Phi3.5 mini and MoE LoRA inference (#8571)

18ae428a · Amit Garg · GitHub · de6f90a1 · 18ae428a · 18ae428a
Unverified commit 18ae428a, authored Sep 19, 2024 by Amit Garg; committed by GitHub on Sep 20, 2024.
3 changed files
--- a/vllm/model_executor/models/__init__.py
+++ b/vllm/model_executor/models/__init__.py
@@ -50,7 +50,7 @@ _GENERATION_MODELS = {
    "OrionForCausalLM": ("orion", "OrionForCausalLM"),
    "PersimmonForCausalLM": ("persimmon", "PersimmonForCausalLM"),
    "PhiForCausalLM": ("phi", "PhiForCausalLM"),
-    "Phi3ForCausalLM": ("llama", "LlamaForCausalLM"),
+    "Phi3ForCausalLM": ("phi3", "Phi3ForCausalLM"),
    "PhiMoEForCausalLM": ("phimoe", "PhiMoEForCausalLM"),
    "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"),
    "Qwen2MoeForCausalLM": ("qwen2_moe", "Qwen2MoeForCausalLM"),

--- a/vllm/model_executor/models/phi3.py
+++ b/vllm/model_executor/models/phi3.py
+# coding=utf-8
+# Adapted from llama.py
+"""Inference-only Phi3 model code, inheriting from llama.py."""
+from vllm.model_executor.models.llama import LlamaForCausalLM
+class Phi3ForCausalLM(LlamaForCausalLM):
+    """Phi3 causal LM that reuses the Llama implementation as-is.
+
+    The only thing this subclass defines is ``packed_modules_mapping``;
+    all other behavior comes from ``LlamaForCausalLM``.
+    """
+
+    # Each packed module name maps to a single-entry list containing only
+    # itself, i.e. ``qkv_proj`` and ``gate_up_proj`` are not broken down
+    # into separate sub-module names here.
+    # NOTE(review): presumably Phi3 checkpoints already store these
+    # projections fused and the LoRA code consults this mapping -- confirm
+    # against LlamaForCausalLM and the LoRA loader.
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "qkv_proj",
+        ],
+        "gate_up_proj": [
+            "gate_up_proj",
+        ],
+    }
--- a/vllm/model_executor/models/phimoe.py
+++ b/vllm/model_executor/models/phimoe.py
@@ -491,6 +491,10 @@ class PhiMoEForCausalLM(nn.Module, SupportsLoRA):
        "o_proj",
        "embed_tokens",
        "lm_head",
+        "w1",
+        "w2",
+        "w3",
+        "gate",
    ]
    embedding_modules = {
        "embed_tokens": "input_embeddings",