Unverified commit 53d2420b, authored by Daniele and committed by GitHub
Browse files

[Bugfix] tpu_model_runner: set vllm config context when calling reset_dynamo_cache() (#30331)


Signed-off-by: Daniele Trifirò <dtrifiro@redhat.com>
parent 9db78f34
...@@ -10,7 +10,7 @@ import torch ...@@ -10,7 +10,7 @@ import torch
import torch.nn as nn import torch.nn as nn
import vllm.envs as envs import vllm.envs as envs
from vllm.config import VllmConfig from vllm.config import VllmConfig, set_current_vllm_config
from vllm.distributed import ( from vllm.distributed import (
ensure_model_parallel_initialized, ensure_model_parallel_initialized,
init_distributed_environment, init_distributed_environment,
...@@ -207,6 +207,7 @@ class TPUWorker: ...@@ -207,6 +207,7 @@ class TPUWorker:
# one compiled bytecode. Having one FX graph/cached bytecode per # one compiled bytecode. Having one FX graph/cached bytecode per
# compiled model is required for `support_torch_compile` decorator to # compiled model is required for `support_torch_compile` decorator to
# skip dynamo guard. # skip dynamo guard.
with set_current_vllm_config(self.vllm_config):
self.model_runner.reset_dynamo_cache() self.model_runner.reset_dynamo_cache()
# Get the maximum amount of memory used by the model weights and # Get the maximum amount of memory used by the model weights and
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment