Merge tag 'v0.7.2' into v0.7.2-dev

66b809cc · zhuwenwen · 37b63c24 · 0408efc6 · 66b809cc · 66b809cc
Commit 66b809cc authored Feb 08, 2025 by zhuwenwen
20 changed files
--- a/tests/v1/test_stats.py
+++ b/tests/v1/test_stats.py
+# SPDX-License-Identifier: Apache-2.0
+
 import pytest

 from vllm.sampling_params import SamplingParams

--- a/tests/v1/test_utils.py
+++ b/tests/v1/test_utils.py
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List

 import torch

--- a/tests/v1/worker/test_gpu_input_batch.py
+++ b/tests/v1/worker/test_gpu_input_batch.py
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Dict, List, Set, Tuple

 import numpy as np

--- a/tests/vllm_test_utils/setup.py
+++ b/tests/vllm_test_utils/setup.py
+# SPDX-License-Identifier: Apache-2.0
+
 from setuptools import setup

 setup(

--- a/tests/vllm_test_utils/vllm_test_utils/__init__.py
+++ b/tests/vllm_test_utils/vllm_test_utils/__init__.py
+# SPDX-License-Identifier: Apache-2.0
 """
 vllm_utils is a package for vLLM testing utilities.
 It does not import any vLLM modules.

--- a/tests/vllm_test_utils/vllm_test_utils/blame.py
+++ b/tests/vllm_test_utils/vllm_test_utils/blame.py
+# SPDX-License-Identifier: Apache-2.0
+
 import contextlib
 import dataclasses
 import sys

--- a/tests/vllm_test_utils/vllm_test_utils/monitor.py
+++ b/tests/vllm_test_utils/vllm_test_utils/monitor.py
+# SPDX-License-Identifier: Apache-2.0
+
 import contextlib
 import dataclasses
 import sys

--- a/tests/weight_loading/test_weight_loading.py
+++ b/tests/weight_loading/test_weight_loading.py
+# SPDX-License-Identifier: Apache-2.0
+
 import os

 import pytest

--- a/tests/worker/test_encoder_decoder_model_runner.py
+++ b/tests/worker/test_encoder_decoder_model_runner.py
+# SPDX-License-Identifier: Apache-2.0
+
 import itertools
 from typing import List


--- a/tests/worker/test_model_input.py
+++ b/tests/worker/test_model_input.py
+# SPDX-License-Identifier: Apache-2.0
+
 import dataclasses
 from typing import List, Tuple, Type


--- a/tests/worker/test_model_runner.py
+++ b/tests/worker/test_model_runner.py
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List

 import pytest
@@ -25,6 +27,15 @@ def _create_model_runner(model: str, *args, **kwargs) -> ModelRunner:
    return model_runner


+def test_deepseek_mla_attn_backend_module():
+    """Check which attention backend the runner reports for a DeepSeek model.
+
+    Builds a model runner for ``deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct``
+    with ``trust_remote_code`` enabled and chunked prefill disabled, then
+    asserts that the ``__name__`` of the runner's ``attn_backend`` is
+    ``"TritonMLABackend"``.
+    """
+    model_runner = _create_model_runner(
+        "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct",
+        trust_remote_code=True,
+        enable_chunked_prefill=False,
+    )
+    assert model_runner.attn_backend.__name__ == "TritonMLABackend"
+
+
 @pytest.mark.parametrize("batch_size", list(range(1, 257)))
 def test_prepare_prompt(batch_size):
    model_runner = _create_model_runner(

--- a/tests/worker/test_profile.py
+++ b/tests/worker/test_profile.py
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 import torch


--- a/tests/worker/test_swap.py
+++ b/tests/worker/test_swap.py
+# SPDX-License-Identifier: Apache-2.0
+
 import torch
 import os


--- a/tools/check_spdx_header.py
+++ b/tools/check_spdx_header.py
+# SPDX-License-Identifier: Apache-2.0
+
+import sys
+
+# Exact header line that add_header() writes into files lacking one.
+SPDX_HEADER = "# SPDX-License-Identifier: Apache-2.0"
+# Prefix that check_spdx_header() looks for; only the prefix is compared, so
+# a line carrying any license identifier after it counts as a header.
+SPDX_HEADER_PREFIX = "# SPDX-License-Identifier:"
+
+
+def check_spdx_header(file_path):
+    """Report whether the file at ``file_path`` already has an SPDX header.
+
+    The file is read as UTF-8 text. Returns ``True`` when the file has no
+    lines at all, or when any line, after stripping surrounding whitespace,
+    starts with ``SPDX_HEADER_PREFIX``. Returns ``False`` otherwise.
+    """
+    with open(file_path, encoding='UTF-8') as file:
+        lines = file.readlines()
+        if not lines:
+            # Empty file like __init__.py
+            return True
+        # The header may sit on any line, not only the first one.
+        for line in lines:
+            if line.strip().startswith(SPDX_HEADER_PREFIX):
+                return True
+    return False
+
+
+def add_header(file_path):
+    """Insert the ``SPDX_HEADER`` line into the file at ``file_path`` in place.
+
+    The file is opened for reading and writing as UTF-8, read fully, and then
+    rewritten from offset 0. If the first line starts with ``#!`` the header
+    is written directly after it; otherwise the header becomes the first
+    line. All other lines are written back in their original order.
+    """
+    with open(file_path, 'r+', encoding='UTF-8') as file:
+        lines = file.readlines()
+        # Rewind so the writes below overwrite the existing content.
+        # NOTE(review): there is no truncate() after rewriting. This relies on
+        # the rewritten text being at least as long as the original bytes;
+        # confirm for files whose line endings are translated on read (e.g.
+        # CRLF files processed on POSIX), where stale trailing bytes may stay.
+        file.seek(0, 0)
+        if lines and lines[0].startswith("#!"):
+            # Keep the "#!" line first and place the header right below it.
+            file.write(lines[0])
+            file.write(SPDX_HEADER + '\n')
+            file.writelines(lines[1:])
+        else:
+            file.write(SPDX_HEADER + '\n')
+            file.writelines(lines)
+
+
+def main():
+    """Check every path given on the command line and fix missing headers.
+
+    Each entry of ``sys.argv[1:]`` is passed to ``check_spdx_header``. Paths
+    without a header are collected, printed, and patched via ``add_header``.
+    The process then exits with status 1 if any file lacked the header and
+    0 otherwise.
+    """
+    files_with_missing_header = []
+    for file_path in sys.argv[1:]:
+        if not check_spdx_header(file_path):
+            files_with_missing_header.append(file_path)
+
+    if files_with_missing_header:
+        print("The following files are missing the SPDX header:")
+        for file_path in files_with_missing_header:
+            print(f"  {file_path}")
+            add_header(file_path)
+
+    # The exit status stays non-zero even though the headers were just added.
+    # NOTE(review): presumably so a hook or CI run flags the modified files
+    # for re-staging; confirm against how this script is invoked.
+    sys.exit(1 if files_with_missing_header else 0)
+
+
+# Run the check only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
--- a/tools/profiler/print_layerwise_table.py
+++ b/tools/profiler/print_layerwise_table.py
+# SPDX-License-Identifier: Apache-2.0
+
 import argparse
 import json
 from typing import Dict

--- a/tools/profiler/visualize_layerwise_profile.py
+++ b/tools/profiler/visualize_layerwise_profile.py
+# SPDX-License-Identifier: Apache-2.0
+
 import argparse
 import copy
 import json

--- a/tools/report_build_time_ninja.py
+++ b/tools/report_build_time_ninja.py
 #!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+
 # Copyright (c) 2018 The Chromium Authors. All rights reserved.
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.

--- a/use_existing_torch.py
+++ b/use_existing_torch.py
+# SPDX-License-Identifier: Apache-2.0
+
 import glob

 requires_files = glob.glob('requirements*.txt')

--- a/vllm/__init__.py
+++ b/vllm/__init__.py
+# SPDX-License-Identifier: Apache-2.0
 """vLLM: a high-throughput and memory-efficient inference engine for LLMs"""
 import os


--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
+# SPDX-License-Identifier: Apache-2.0
+
 import contextlib
 import importlib
 from typing import TYPE_CHECKING, List, Optional, Tuple, Union, Type
@@ -1369,6 +1371,15 @@ def moe_align_block_size(topk_ids: torch.Tensor, num_experts: int,
                                          num_tokens_post_pad)


+def sgl_moe_align_block_size(topk_ids: torch.Tensor, num_experts: int,
+                             block_size: int, sorted_token_ids: torch.Tensor,
+                             experts_ids: torch.Tensor,
+                             num_tokens_post_pad: torch.Tensor) -> None:
+    """Forward to the ``torch.ops._moe_C.sgl_moe_align_block_size`` custom op.
+
+    All six arguments are passed through unchanged, in the same order, and
+    nothing is returned.
+
+    NOTE(review): presumably the op writes its results in place into
+    ``sorted_token_ids``, ``experts_ids`` and ``num_tokens_post_pad``;
+    confirm against the kernel implementation.
+    """
+    torch.ops._moe_C.sgl_moe_align_block_size(topk_ids, num_experts,
+                                              block_size, sorted_token_ids,
+                                              experts_ids, num_tokens_post_pad)
+
+
 def topk_softmax(topk_weights: torch.Tensor, topk_ids: torch.Tensor,
                 token_expert_indicies: torch.Tensor,
                 gating_output: float) -> None:
@@ -1445,6 +1456,11 @@ def copy_blocks(key_caches: List[torch.Tensor],
    torch.ops._C_cache_ops.copy_blocks(key_caches, value_caches, block_mapping)


+def copy_blocks_mla(kv_caches: List[torch.Tensor],
+                    block_mapping: torch.Tensor) -> None:
+    """Forward to the ``torch.ops._C_cache_ops.copy_blocks_mla`` custom op.
+
+    ``kv_caches`` and ``block_mapping`` are passed through unchanged and
+    nothing is returned.
+
+    NOTE(review): presumably the op copies cache blocks in place according
+    to ``block_mapping``; confirm the mapping layout against the kernel.
+    """
+    torch.ops._C_cache_ops.copy_blocks_mla(kv_caches, block_mapping)
+
+
 def swap_blocks(src: torch.Tensor, dst: torch.Tensor,
                block_mapping: torch.Tensor) -> None:
    torch.ops._C_cache_ops.swap_blocks(src, dst, block_mapping)