Unverified commit 42bb201f, authored by Woosuk Kwon, committed by GitHub
Browse files

[V1][Minor] Set pin_memory=False for token_ids_cpu tensor (#11581)


Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
parent 59d6bb4c
@@ -57,11 +57,13 @@ class InputBatch:
         # TODO(woosuk): This buffer could be too large if max_model_len is big.
         # Find a way to reduce the CPU memory usage.
+        # This buffer is not directly transferred to the GPU, so it does not
+        # need to be pinned.
         self.token_ids_cpu_tensor = torch.zeros(
             (max_num_reqs, max_model_len),
             device="cpu",
             dtype=torch.int32,
-            pin_memory=pin_memory,
+            pin_memory=False,
         )
         self.token_ids_cpu = self.token_ids_cpu_tensor.numpy()
         self.num_computed_tokens_cpu = np.empty(max_num_reqs, dtype=np.int32)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment