Fix Neuron performance issue (#13589)

6a417b86 · ajayvohra2005 · GitHub · d3ea5011 · 6a417b86
Unverified commit 6a417b86, authored Feb 20, 2025 by ajayvohra2005; committed by GitHub on Feb 20, 2025
Show whitespace changes
Inline Side-by-side

Showing with 2 additions and 2 deletions

vllm/worker/neuron_worker.py vllm/worker/neuron_worker.py +2 -2

No files found.
--- a/vllm/worker/neuron_worker.py
+++ b/vllm/worker/neuron_worker.py
@@ -76,7 +76,7 @@ class NeuronWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
        # Set the number of GPU blocks to be the same as the maximum number of
        # sequences that can be processed in a single batch. This is equivalent
        # to schedule without PagedAttention.
-        num_gpu_blocks = self.scheduler_config.max_num_seqs
+        num_gpu_blocks = self.scheduler_config.max_num_seqs + 1

        # Swap not yet supported with Neuron backend.
        num_cpu_blocks = 0
@@ -90,7 +90,7 @@ class NeuronWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):

        # Different values are not tested.
        assert num_cpu_blocks == 0
-        assert num_gpu_blocks == self.scheduler_config.max_num_seqs
+        assert num_gpu_blocks == self.scheduler_config.max_num_seqs + 1

        self.cache_config.num_gpu_blocks = num_gpu_blocks
        self.cache_config.num_cpu_blocks = num_cpu_blocks