perf(multimodal): strip inline base64 from TCP transport in frontend decoding (#7895)

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

perf(multimodal): strip inline base64 from TCP transport in frontend decoding (#7895)
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
6b8545fc · Qi Wang · GitHub · da670d44 · 6b8545fc · 6b8545fc
Unverified Commit 6b8545fc authored Apr 06, 2026 by Qi Wang Committed by GitHub Apr 06, 2026
Show whitespace changes
Inline Side-by-side

Showing with 151 additions and 0 deletions

lib/llm/src/preprocessor.rs lib/llm/src/preprocessor.rs +92 -0

tests/serve/test_vllm.py tests/serve/test_vllm.py +59 -0

No files found.
--- a/lib/llm/src/preprocessor.rs
+++ b/lib/llm/src/preprocessor.rs
@@ -380,6 +380,32 @@ impl OpenAIPreprocessor {
        }
    }
+    /// Replace inline `data:` URLs with empty strings, in place, in array-form message content parts.
+    /// Preserves all other URLs (e.g. HTTP(S)), text content, and overall message structure.
+    fn strip_inline_data_urls(messages: &mut serde_json::Value) {
+        let Some(arr) = messages.as_array_mut() else { // not a JSON array: nothing to do
+            return;
+        };
+        for msg in arr {
+            let Some(content) = msg.get_mut("content") else { // message without a "content" field
+                continue;
+            };
+            let Some(parts) = content.as_array_mut() else { // non-array content (e.g. a plain string) is left as-is
+                continue;
+            };
+            for part in parts {
+                for key in ["image_url", "video_url", "audio_url"] { // only these three keys are inspected; NOTE(review): confirm all three are decoded upstream before this runs
+                    if let Some(media) = part.get_mut(key)
+                        && let Some(url) = media.get_mut("url")
+                        && url.as_str().is_some_and(|s| s.starts_with("data:")) // case-sensitive: "DATA:..." is not stripped
+                    {
+                        *url = serde_json::Value::String(String::new()); // the "url" key stays; only its value is emptied
+                    }
+                }
+            }
+        }
+    }
    pub async fn gather_multi_modal_data<R: OAIChatLikeRequest>(
        &self,
        request: &R,
@@ -458,6 +484,14 @@ impl OpenAIPreprocessor {
            let mut extra_args = serde_json::json!({
                "messages": messages_json
            });
+            // Strip redundant inline data: URLs only when frontend decoding is active
+            // (media_loader decoded the images into RDMA descriptors). TRT-LLM and
+            // other backends that pass URLs through still need the original data: URIs.
+            if self.media_loader.is_some() {
+                Self::strip_inline_data_urls(&mut extra_args["messages"]);
+            }
            if let Some(ref prompt) = formatted_prompt {
                extra_args["formatted_prompt"] = serde_json::Value::String(prompt.clone());
            }
@@ -1549,6 +1583,64 @@ impl
 // Note: tests for jailing and parser detection live in `lib/llm/tests/test_jail.rs`
+#[cfg(test)]
+mod strip_tests { // unit tests for OpenAIPreprocessor::strip_inline_data_urls
+    use super::OpenAIPreprocessor;
+    #[test] // data: image URL is blanked; the text part and the HTTPS image URL are unchanged
+    fn test_strip_inline_data_urls_replaces_data_urls() {
+        let mut messages = serde_json::json!([{
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "What is this?"},
+                {"type": "image_url", "image_url": {"url": "data:image/png;base64,iVBOR...longdata..."}},
+                {"type": "image_url", "image_url": {"url": "https://example.com/img.png"}}
+            ]
+        }]);
+        OpenAIPreprocessor::strip_inline_data_urls(&mut messages);
+        let parts = messages[0]["content"].as_array().unwrap();
+        assert_eq!(parts[0]["text"], "What is this?");
+        assert_eq!(parts[1]["image_url"]["url"], "");
+        assert_eq!(parts[2]["image_url"]["url"], "https://example.com/img.png");
+    }
+    #[test] // video data: URL is blanked; the HTTPS audio URL is unchanged
+    fn test_strip_inline_data_urls_handles_video_audio() {
+        let mut messages = serde_json::json!([{
+            "role": "user",
+            "content": [
+                {"type": "video_url", "video_url": {"url": "data:video/mp4;base64,AAAA..."}},
+                {"type": "audio_url", "audio_url": {"url": "https://example.com/audio.wav"}}
+            ]
+        }]);
+        OpenAIPreprocessor::strip_inline_data_urls(&mut messages);
+        let parts = messages[0]["content"].as_array().unwrap();
+        assert_eq!(parts[0]["video_url"]["url"], "");
+        assert_eq!(
+            parts[1]["audio_url"]["url"],
+            "https://example.com/audio.wav"
+        );
+    }
+    #[test] // string-valued "content" is left exactly as it was
+    fn test_strip_inline_data_urls_preserves_text_only() {
+        let mut messages = serde_json::json!([{
+            "role": "user",
+            "content": "plain text message"
+        }]);
+        let original = messages.clone();
+        OpenAIPreprocessor::strip_inline_data_urls(&mut messages);
+        assert_eq!(messages, original);
+    }
+    #[test] // an empty message array stays empty
+    fn test_strip_inline_data_urls_empty_messages() {
+        let mut messages = serde_json::json!([]);
+        OpenAIPreprocessor::strip_inline_data_urls(&mut messages);
+        assert_eq!(messages, serde_json::json!([]));
+    }
+}
 #[cfg(test)]
 mod tests {
    use super::*;

--- a/tests/serve/test_vllm.py
+++ b/tests/serve/test_vllm.py
@@ -905,6 +905,65 @@ def test_multimodal_b64(
    run_serve_deployment(config, request, ports=dynamo_dynamic_ports)
+@pytest.mark.vllm
+@pytest.mark.e2e
+@pytest.mark.gpu_1
+@pytest.mark.pre_merge
+@pytest.mark.timeout(220)  # same value as the VLLMConfig timeout below
+def test_multimodal_b64_frontend_decoding(
+    request,
+    runtime_services_dynamic_ports,
+    dynamo_dynamic_ports,
+    predownload_models,
+):
+    """
+    Test multimodal inference with base64 images through frontend decoding path.
+    This exercises the Rust frontend image decode + NIXL RDMA transfer path
+    with inline base64 data: URIs (not HTTP URLs). Verifies that the
+    strip_inline_data_urls optimization does not break correctness.
+    """
+    b64_img = base64.b64encode(get_multimodal_test_image_bytes()).decode()  # base64 text of the test image bytes
+    b64_payload = chat_payload(
+        [
+            {
+                "type": "text",
+                "text": "What colors are in the following image? Respond only with the colors.",
+            },
+            {
+                "type": "image_url",
+                "image_url": {"url": f"data:image/png;base64,{b64_img}"},  # inline data: URI, the form the stripping targets
+            },
+        ],
+        repeat_count=1,
+        expected_response=["green"],  # NOTE(review): presumably matched against the model reply; confirm in chat_payload
+        temperature=0.0,
+        max_tokens=100,
+    )
+    config = VLLMConfig(
+        name="test_multimodal_b64_frontend_decoding",
+        directory=vllm_dir,
+        script_name="agg_multimodal.sh",
+        marks=[],
+        model="Qwen/Qwen3-VL-2B-Instruct",
+        script_args=[
+            "--model",
+            "Qwen/Qwen3-VL-2B-Instruct",
+            "--frontend-decoding",  # NOTE(review): presumably enables the frontend decode path under test; confirm in agg_multimodal.sh
+        ],
+        delayed_start=0,
+        timeout=220,
+        request_payloads=[b64_payload],
+    )
+    config = dataclasses.replace(  # copy of config with the frontend port taken from the dynamic-ports fixture
+        config, frontend_port=dynamo_dynamic_ports.frontend_port
+    )
+    run_serve_deployment(config, request, ports=dynamo_dynamic_ports)
 # LoRA Test Directory
 lora_dir = os.path.join(vllm_dir, "launch/lora")