Merge tag 'v0.7.2' into v0.7.2-dev

66b809cc · zhuwenwen · 37b63c24 · 0408efc6 · 66b809cc · 66b809cc
Commit 66b809cc authored Feb 08, 2025 by zhuwenwen
20 changed files
--- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py
+++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py
+# SPDX-License-Identifier: Apache-2.0
+
 from vllm import ModelRegistry



--- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
+++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Iterable, List, Optional, Tuple, Union

 import torch

--- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py
+++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Optional

 import torch

--- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py
+++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Optional

 import torch

--- a/tests/plugins/vllm_add_dummy_platform/setup.py
+++ b/tests/plugins/vllm_add_dummy_platform/setup.py
+# SPDX-License-Identifier: Apache-2.0
+
 from setuptools import setup

 setup(

--- a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py
+++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Optional



--- a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py
+++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py
+# SPDX-License-Identifier: Apache-2.0
+
 from vllm.attention.backends.flash_attn import FlashAttentionBackend



--- a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py
+++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py
+# SPDX-License-Identifier: Apache-2.0
+
 from vllm.platforms.cuda import CudaPlatform



--- a/tests/plugins_tests/test_platform_plugins.py
+++ b/tests/plugins_tests/test_platform_plugins.py
+# SPDX-License-Identifier: Apache-2.0
+
 import torch

 from tests.kernels.utils import override_backend_env_variable

--- a/tests/prefix_caching/test_disable_sliding_window.py
+++ b/tests/prefix_caching/test_disable_sliding_window.py
+# SPDX-License-Identifier: Apache-2.0
 """Compare the with and without prefix caching.

 Run `pytest tests/prefix_caching/test_prefix_caching.py`.

--- a/tests/prefix_caching/test_prefix_caching.py
+++ b/tests/prefix_caching/test_prefix_caching.py
+# SPDX-License-Identifier: Apache-2.0
 """Compare the with and without prefix caching.

 Run `pytest tests/prefix_caching/test_prefix_caching.py`.

--- a/tests/prompt_adapter/test_bloom.py
+++ b/tests/prompt_adapter/test_bloom.py
+# SPDX-License-Identifier: Apache-2.0
+
 import pytest
 import os


--- a/tests/prompt_adapter/test_multi_adapter_inference.py
+++ b/tests/prompt_adapter/test_multi_adapter_inference.py
+# SPDX-License-Identifier: Apache-2.0
+
 from vllm import EngineArgs, LLMEngine, SamplingParams
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from ..utils import models_path_prefix

--- a/tests/prompt_adapter/test_pa_lora.py
+++ b/tests/prompt_adapter/test_pa_lora.py
+# SPDX-License-Identifier: Apache-2.0
+
 from huggingface_hub import snapshot_download

 from vllm import EngineArgs, LLMEngine, SamplingParams

--- a/tests/quantization/test_bitsandbytes.py
+++ b/tests/quantization/test_bitsandbytes.py
+# SPDX-License-Identifier: Apache-2.0
 '''Tests whether bitsandbytes computation is enabled correctly.

 Run `pytest tests/quantization/test_bitsandbytes.py`.

--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
+# SPDX-License-Identifier: Apache-2.0
 """Test model set-up and weight loading for llmcompressor-quantized models.

 Run `pytest tests/quantization/test_compressed_tensors.py`.
 """
+
 from typing import Optional

 import pytest
@@ -24,12 +26,30 @@ from ..utils import models_path_prefix

 @pytest.mark.parametrize(
    "model_args",
-    [(os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"), "tensor",
-      QuantizationType.INT, 2560, True),
-     (os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w8-channel-a8-tensor"), "channel",
-      QuantizationType.INT, 2560, True),
-     (os.path.join(models_path_prefix, "nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama"), "tensor",
-      QuantizationType.INT, 2560, False)])
+    [
+        (
+            os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"),
+            "tensor",
+            QuantizationType.INT,
+            2560,
+            True,
+        ),
+        (
+            os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w8-channel-a8-tensor"),
+            "channel",
+            QuantizationType.INT,
+            2560,
+            True,
+        ),
+        (
+            os.path.join(models_path_prefix, "nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama"),
+            "tensor",
+            QuantizationType.INT,
+            2560,
+            False,
+        ),
+    ],
+)
 def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
    model_path, strategy, quant_type, shape_0, is_symmetric = model_args
    with vllm_runner(model_path, enforce_eager=True) as llm:
@@ -87,21 +107,31 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
        assert output


-@pytest.mark.parametrize("model_path", [
-    os.path.join(models_path_prefix, "neuralmagic/Llama-3.2-1B-quantized.w8a8"),
-    os.path.join(models_path_prefix, "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym"),
-    os.path.join(models_path_prefix, "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym"),
-    os.path.join(models_path_prefix, "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym")
-])
+@pytest.mark.parametrize(
+    "model_path",
+    [
+        os.path.join(models_path_prefix, "neuralmagic/Llama-3.2-1B-quantized.w8a8"),
+        os.path.join(models_path_prefix, "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym"),
+        os.path.join(models_path_prefix, "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym"),
+        os.path.join(models_path_prefix, "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym"),
+    ],
+)
 @pytest.mark.parametrize("max_tokens", [32])
 @pytest.mark.parametrize("num_logprobs", [10])
-def test_compressed_tensors_w8a8_logprobs(hf_runner, vllm_runner,
-                                          example_prompts, model_path,
-                                          max_tokens, num_logprobs):
+def test_compressed_tensors_w8a8_logprobs(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model_path,
+    max_tokens,
+    num_logprobs,
+):
    dtype = "bfloat16"

    # skip language translation prompt for the static per tensor asym model
-    if model_path == os.path.join(models_path_prefix, "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym"):  # noqa: E501
+    if (model_path ==
+            os.path.join(models_path_prefix, "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym")
+        ):  # noqa: E501
        example_prompts = example_prompts[0:-1]

    with hf_runner(model_path, dtype=dtype) as hf_model:
@@ -127,13 +157,21 @@ def test_compressed_tensors_no_enforce_eager(vllm_runner):
        assert output


-@pytest.mark.parametrize("model_args", [
-    (os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2"), "tensor"),
-    (os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym"), "tensor"),
-    (os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2"), "channel"),
-    (os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym"),
-     "channel"),
-])
+@pytest.mark.parametrize(
+    "model_args",
+    [
+        (os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2"), "tensor"),
+        (os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym"), "tensor"),
+        (
+            os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2"),
+            "channel",
+        ),
+        (
+            os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym"),
+            "channel",
+        ),
+    ],
+)
 def test_compressed_tensors_w8a8_dynamic_per_token(vllm_runner, model_args):
    model_path, strategy = model_args
    with vllm_runner(model_path, dtype=torch.float16) as llm:
@@ -160,9 +198,12 @@ def test_compressed_tensors_w8a8_dynamic_per_token(vllm_runner, model_args):
                    reason="WNA16 is not supported on ROCm.")
 @pytest.mark.parametrize(
    "wNa16_args",
-    [(os.path.join(models_path_prefix,"nm-testing/tinyllama-oneshot-w4a16-channel-v2"), "channel", None, 8),
+    [
+     (os.path.join(models_path_prefix,"nm-testing/tinyllama-oneshot-w4a16-channel-v2"), "channel", None, 8),
     (os.path.join(models_path_prefix,"nm-testing/tinyllama-oneshot-w4a16-group128-v2"), "group", 128, 8),
-     (os.path.join(models_path_prefix,"nm-testing/tinyllama-oneshot-w8a16-per-channel"), "channel", None, 4)])
+     (os.path.join(models_path_prefix,"nm-testing/tinyllama-oneshot-w8a16-per-channel"), "channel", None, 4),
+    ],
+)
 def test_compressed_tensors_wNa16(vllm_runner, wNa16_args):
    model, strategy, group, pack_factor = wNa16_args
    with vllm_runner(model) as llm:
@@ -226,7 +267,8 @@ def test_compressed_tensors_fp8(vllm_runner):
                              CompressedTensorsLinearMethod)
            assert isinstance(
                qkv_proj.scheme,
-                (CompressedTensorsW8A8Fp8, CompressedTensorsW8A16Fp8))
+                (CompressedTensorsW8A8Fp8, CompressedTensorsW8A16Fp8),
+            )

            assert qkv_proj.input_scale.dtype is torch.float32

@@ -251,9 +293,14 @@ def test_compressed_tensors_kv_cache(vllm_runner):
        assert output


-@pytest.mark.skipif(not sparse_cutlass_supported(),
-                    reason="Sparse FP8 is not yet supported on this GPU type.")
-def _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy):
+@pytest.mark.skipif(
+    not sparse_cutlass_supported(),
+    reason="Sparse FP8 is not yet supported on this GPU type.",
+)
+def _test_2of4_quant_models(qkv_proj,
+                            weight_strategy,
+                            input_strategy,
+                            format="dense"):
    assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
    assert isinstance(qkv_proj.scheme, CompressedTensors24)

@@ -262,22 +309,39 @@ def _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy):
    assert qkv_proj.scheme.quantized
    assert qkv_proj.quant_method.quantization_config.sparsity_scheme_map
    sparsity_map = qkv_proj.quant_method.quantization_config.sparsity_scheme_map  # noqa: E501
-    assert sparsity_map.get("Linear").format == "dense"
+    assert sparsity_map.get("Linear").format == format
    assert sparsity_map.get("Linear").sparsity_structure == "2:4"


-@pytest.mark.skipif(not current_platform.has_device_capability(90),
-                    reason="Sparse FP8 is not yet supported on this GPU type.")
-@pytest.mark.parametrize("args_2of4", [
-    ("nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-2of4-testing", "channel",
-     "token"),
-    ("nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-Per-Tensor-testing",
-     "channel", "tensor"),
-    ("nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-testing", "tensor",
-     "tensor"),
-    ("nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-IA-Per-Tensor-Weight-testing",
-     "tensor", "token"),
-])
+@pytest.mark.skipif(
+    not current_platform.has_device_capability(90),
+    reason="Sparse FP8 is not yet supported on this GPU type.",
+)
+@pytest.mark.parametrize(
+    "args_2of4",
+    [
+        (
+            "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-2of4-testing",
+            "channel",
+            "token",
+        ),
+        (
+            "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-Per-Tensor-testing",
+            "channel",
+            "tensor",
+        ),
+        (
+            "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-testing",
+            "tensor",
+            "tensor",
+        ),
+        (
+            "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-IA-Per-Tensor-Weight-testing",
+            "tensor",
+            "token",
+        ),
+    ],
+)
 def test_compressed_tensors_2of4_quant_fp8(vllm_runner, args_2of4):
    model, weight_strategy, input_strategy = args_2of4
    with vllm_runner(model) as llm:
@@ -296,16 +360,134 @@ def test_compressed_tensors_2of4_quant_fp8(vllm_runner, args_2of4):
        assert output


-@pytest.mark.skipif(not sparse_cutlass_supported(),
-                    reason="Sparse FP8 is not yet supported on this GPU type.")
-@pytest.mark.parametrize("args_2of4", [
-    ("nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Channel-Weight-testing",
-     "channel", "token"),
-    ("nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Static-testing", "tensor",
-     "tensor"),
-    ("nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Tensor-Weight-testing",
-     "tensor", "token"),
-])
+@pytest.mark.skipif(
+    not current_platform.has_device_capability(90),
+    reason="Sparse FP8 is not yet supported on this GPU type.",
+)
+@pytest.mark.parametrize(
+    "args_2of4",
+    [
+        (
+            "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM",
+            "channel",
+            "token",
+        ),
+        (
+            "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_fp8-BitM",
+            "channel",
+            "tensor",
+        ),
+        (
+            "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_fp8-BitM",
+            "tensor",
+            "token",
+        ),
+        (
+            "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_fp8-BitM",
+            "tensor",
+            "tensor",
+        ),
+    ],
+)
+def test_compressed_tensors_2of4_quant_fp8_compressed(vllm_runner, args_2of4):
+    model, weight_strategy, input_strategy = args_2of4
+    with vllm_runner(model) as llm:
+
+        def check_model(model):
+            layer = model.model.layers[0]
+
+            qkv_proj = layer.self_attn.qkv_proj
+            assert qkv_proj.scheme.weights_dtype == torch.float8_e4m3fn
+            _test_2of4_quant_models(
+                qkv_proj,
+                weight_strategy,
+                input_strategy,
+                format="sparse-24-bitmask",
+            )
+
+        llm.apply_model(check_model)
+
+        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        print(output)
+        assert output
+
+
+@pytest.mark.skipif(
+    not sparse_cutlass_supported(),
+    reason="cutlass is not yet supported on this GPU type.",
+)
+@pytest.mark.parametrize(
+    "args_2of4",
+    [
+        (
+            "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_int8-BitM",
+            "channel",
+            "token",
+        ),
+        (
+            "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_int8-BitM",
+            "channel",
+            "tensor",
+        ),
+        (
+            "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_int8-BitM",
+            "tensor",
+            "token",
+        ),
+        (
+            "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_int8-BitM",
+            "tensor",
+            "tensor",
+        ),
+    ],
+)
+def test_compressed_tensors_2of4_quant_int8_compressed(vllm_runner, args_2of4):
+    model, weight_strategy, input_strategy = args_2of4
+    with vllm_runner(model) as llm:
+
+        def check_model(model):
+            layer = model.model.layers[0]
+
+            qkv_proj = layer.self_attn.qkv_proj
+            assert qkv_proj.scheme.weights_dtype == torch.int8
+            _test_2of4_quant_models(
+                qkv_proj,
+                weight_strategy,
+                input_strategy,
+                format="sparse-24-bitmask",
+            )
+
+        llm.apply_model(check_model)
+
+        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        print(output)
+        assert output
+
+
+@pytest.mark.skipif(
+    not sparse_cutlass_supported(),
+    reason="Sparse FP8 is not yet supported on this GPU type.",
+)
+@pytest.mark.parametrize(
+    "args_2of4",
+    [
+        (
+            "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Channel-Weight-testing",
+            "channel",
+            "token",
+        ),
+        (
+            "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Static-testing",
+            "tensor",
+            "tensor",
+        ),
+        (
+            "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Tensor-Weight-testing",
+            "tensor",
+            "token",
+        ),
+    ],
+)
 def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4):
    model, weight_strategy, input_strategy = args_2of4
    with vllm_runner(model) as llm:
@@ -327,10 +509,12 @@ def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4):
 @pytest.mark.skip(reason="2of4 sparse w16a16 CUTLASS produces bad output.")
 @pytest.mark.skipif(
    not sparse_cutlass_supported(),
-    reason="2of4 Sparse is not yet supported on this GPU type.")
+    reason="2of4 Sparse is not yet supported on this GPU type.",
+)
 @pytest.mark.parametrize(
    "args_2of4",
-    [("nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor")])
+    [("nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor")],
+)
 def test_compressed_tensors_2of4_sparse(vllm_runner, args_2of4):
    model = args_2of4
    with vllm_runner(model) as llm:
@@ -347,7 +531,9 @@ def test_compressed_tensors_2of4_sparse(vllm_runner, args_2of4):
            assert qkv_proj.scheme.input_quant is None
            assert not qkv_proj.scheme.quantized
            assert qkv_proj.quant_method.quantization_config.sparsity_scheme_map
-            sparsity_map = qkv_proj.quant_method.quantization_config.sparsity_scheme_map  # noqa: E501
+            sparsity_map = (
+                qkv_proj.quant_method.quantization_config.sparsity_scheme_map
+            )  # noqa: E501
            assert sparsity_map.get("Linear").format == "dense"
            assert sparsity_map.get("Linear").sparsity_structure == "2:4"

@@ -356,3 +542,38 @@ def test_compressed_tensors_2of4_sparse(vllm_runner, args_2of4):
        output = llm.generate_greedy("Hello my name is", max_tokens=20)
        print(output)
        assert output
+
+
+@pytest.mark.skipif(
+    not sparse_cutlass_supported(),
+    reason="Cutlass is not yet supported on this GPU type.",
+)
+@pytest.mark.parametrize(
+    "args_2of4", [("nm-testing/llama2.c-stories42M-pruned2.4-compressed")])
+def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4):
+    model = args_2of4
+    with vllm_runner(model) as llm:
+
+        def check_model(model):
+            layer = model.model.layers[0]
+
+            qkv_proj = layer.self_attn.qkv_proj
+            assert isinstance(qkv_proj.quant_method,
+                              CompressedTensorsLinearMethod)
+            assert isinstance(qkv_proj.scheme, CompressedTensors24)
+
+            assert qkv_proj.scheme.weight_quant is None
+            assert qkv_proj.scheme.input_quant is None
+            assert not qkv_proj.scheme.quantized
+            assert qkv_proj.quant_method.quantization_config.sparsity_scheme_map
+            sparsity_map = (
+                qkv_proj.quant_method.quantization_config.sparsity_scheme_map
+            )  # noqa: E501
+            assert sparsity_map.get("Linear").format == "sparse-24-bitmask"
+            assert sparsity_map.get("Linear").sparsity_structure == "2:4"
+
+        llm.apply_model(check_model)
+
+        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        print(output)
+        assert output
--- a/tests/quantization/test_configs.py
+++ b/tests/quantization/test_configs.py
+# SPDX-License-Identifier: Apache-2.0
 """Tests whether Marlin models can be loaded from the autogptq config.

 Run `pytest tests/quantization/test_configs.py --forked`.

--- a/tests/quantization/test_cpu_offload.py
+++ b/tests/quantization/test_cpu_offload.py
+# SPDX-License-Identifier: Apache-2.0
+
 # Expanded quantized model tests for CPU offloading
 # Base tests: tests/basic_correctness/test_cpu_offload.py


--- a/tests/quantization/test_experts_int8.py
+++ b/tests/quantization/test_experts_int8.py
+# SPDX-License-Identifier: Apache-2.0
+
 # flake8: noqa
 """Tests experts_int8 quantization startup and generation, 
 doesn't test correctness

--- a/tests/quantization/test_fp8.py
+++ b/tests/quantization/test_fp8.py
+# SPDX-License-Identifier: Apache-2.0
 """Tests whether FP8 computation is enabled correctly.

 Run `pytest tests/quantization/test_fp8.py --forked`.