Unverified Commit 52df63b7 authored by Baber Abbasi, committed by GitHub

fix vllm (#2708)

* fix vllm

* fix data_parallel

* copy to multimodal
parent 41b952f3
@@ -109,7 +109,7 @@ class VLLM(TemplateLM):
             eval_logger.warning(
                 "You might experience occasional issues with model weight downloading when data_parallel is in use. To ensure stable performance, run with data_parallel_size=1 until the weights are downloaded and cached."
             )
-            self.model_args["worker_use_ray"] = True
+            self.model_args["distributed_executor_backend"] = "ray"
             self.batch_size = "auto"
             eval_logger.info("Manual batching is not compatible with data parallelism.")
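
This hunk swaps the deprecated `worker_use_ray` boolean for vLLM's `distributed_executor_backend` engine argument. A minimal sketch of how the updated `model_args` would reach vLLM (the model name is a placeholder, and this assumes a vLLM version that accepts `distributed_executor_backend`):

```python
# Hypothetical illustration, not the harness's actual code: the model_args
# dict assembled above is eventually unpacked into vLLM's LLM constructor.
from vllm import LLM

model_args = {
    "model": "facebook/opt-125m",  # placeholder model id
    "tensor_parallel_size": 1,
    # replaces the removed `worker_use_ray=True` flag
    "distributed_executor_backend": "ray",
}
llm = LLM(**model_args)
```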
@@ -246,9 +246,7 @@ class VLLM(TemplateLM):
             # vLLM hangs if tensor_parallel > 1 and resources are set in ray.remote
             # also seems to only work with decorator and not with ray.remote() fn
             # see https://github.com/vllm-project/vllm/issues/973
-            # note: this has changed on 0.3.3, and it only works now if num_gpus are set.
-            # but then tensor_parallel breaks
-            @ray.remote
+            @ray.remote(num_gpus=1 if self.tensor_parallel_size == 1 else None)
             def run_inference_one_model(
                 model_args: dict,
                 sampling_params,
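
For context, here is a sketch of the data-parallel dispatch pattern this decorator supports: each Ray task builds its own vLLM engine and handles one shard of the requests. Names like `shards` are illustrative rather than the harness's exact code, and the `num_gpus` expression mirrors the change above (pin one GPU per replica only when `tensor_parallel_size == 1`; otherwise leave placement to vLLM's Ray backend, which hangs if resources are pinned here):

```python
import ray
from vllm import LLM

tensor_parallel_size = 1  # stand-in for self.tensor_parallel_size

# Reserve one GPU per replica only without tensor parallelism; with
# tensor_parallel > 1, vLLM's Ray executor manages GPU placement itself.
@ray.remote(num_gpus=1 if tensor_parallel_size == 1 else None)
def run_inference_one_model(model_args: dict, sampling_params, requests):
    # requests: a list of prompt strings in this simplified sketch
    llm = LLM(**model_args)
    return llm.generate(requests, sampling_params=sampling_params)

# One task per data-parallel shard of the request list (illustrative):
# results = ray.get([
#     run_inference_one_model.remote(model_args, sampling_params, shard)
#     for shard in shards
# ])
```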
@@ -109,9 +109,7 @@ class VLLM_VLM(VLLM):
             # vLLM hangs if tensor_parallel > 1 and resources are set in ray.remote
             # also seems to only work with decorator and not with ray.remote() fn
             # see https://github.com/vllm-project/vllm/issues/973
-            # note: this has changed on 0.3.3, and it only works now if num_gpus are set.
-            # but then tensor_parallel breaks
-            @ray.remote
+            @ray.remote(num_gpus=1 if self.tensor_parallel_size == 1 else None)
             def run_inference_one_model(
                 model_args: dict, sampling_params, requests: List[List[dict]]
             ):