Revert gemma3n fast prefill changes (#23897)

Signed-off-by: Yong Hoon Shin <yhshin@meta.com>

Revert gemma3n fast prefill changes (#23897)
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
8c3e1999 · Yong Hoon Shin · GitHub · 1c26b422 · 8c3e1999 · 8c3e1999
Unverified commit 8c3e1999, authored Aug 29, 2025 by Yong Hoon Shin; committed by GitHub on Aug 29, 2025.
3 changed files
--- a/tests/v1/e2e/test_kv_sharing_fast_prefill.py
+++ b/tests/v1/e2e/test_kv_sharing_fast_prefill.py
@@ -64,6 +64,7 @@ def cleanup(llm: LLM, compilation_config: CompilationConfig):

 @fork_new_process_for_each_test
 @pytest.mark.parametrize("enforce_eager", [True])
+@pytest.mark.skip(reason="Disable until Gemma3n supports fast prefill")
 def test_kv_sharing_fast_prefill(
    monkeypatch: pytest.MonkeyPatch,
    enforce_eager: bool,

--- a/vllm/model_executor/models/gemma3n.py
+++ b/vllm/model_executor/models/gemma3n.py
--- a/vllm/model_executor/models/gemma3n_mm.py
+++ b/vllm/model_executor/models/gemma3n_mm.py
@@ -620,7 +620,7 @@ class Gemma3nForConditionalGeneration(nn.Module, SupportsMultiModal):
        # NOTE (NickLucche) Each pass needs tokens to compute PLE so we cache
        # them here, as the model  forward has only access to the input_embeds.
        if input_ids is not None:
-            per_layer_inputs = self.language_model.model.self_decoder.get_per_layer_input_embeddings(
+            per_layer_inputs = self.language_model.model.get_per_layer_input_embeddings(
                input_ids)
            per_layer_inputs = per_layer_inputs.reshape(
                -1, self.config.text_config.num_hidden_layers,