Unverified Commit 6a417b86 authored by ajayvohra2005's avatar ajayvohra2005 Committed by GitHub
Browse files

fix neuron performance issue (#13589)

parent d3ea5011
......@@ -76,7 +76,7 @@ class NeuronWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
# Set the number of GPU blocks to be the same as the maximum number of
# sequences that can be processed in a single batch. This is equivalent
# to schedule without PagedAttention.
num_gpu_blocks = self.scheduler_config.max_num_seqs
num_gpu_blocks = self.scheduler_config.max_num_seqs + 1
# Swap not yet supported with Neuron backend.
num_cpu_blocks = 0
......@@ -90,7 +90,7 @@ class NeuronWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
# Different values are not tested.
assert num_cpu_blocks == 0
assert num_gpu_blocks == self.scheduler_config.max_num_seqs
assert num_gpu_blocks == self.scheduler_config.max_num_seqs + 1
self.cache_config.num_gpu_blocks = num_gpu_blocks
self.cache_config.num_cpu_blocks = num_cpu_blocks
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment