Commit 1b5aa25e authored by xiabo
Browse files

区分pcie和hglink custom allreduce的使用

parent c6a9b490
......@@ -264,10 +264,13 @@ class CustomAllreduce:
return None
if self._IS_CAPTURING:
if torch.cuda.is_current_stream_capturing():
if not envs.VLLM_CUSTOM_CACHE:
return self.all_reduce(input, registered=False)
else:
if envs.VLLM_CUSTOM_CACHE:
return self.all_reduce(input, registered=True)
else:
if not self.fully_connected:
return self.all_reduce(input, registered=False)
else:
return self.all_reduce(input, registered=True)
else:
# If warm up, mimic the allocation pattern since custom
# allreduce is out-of-place.
......
......@@ -1080,7 +1080,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
# flag to control vllm to use optimized kernels
"VLLM_CUSTOM_CACHE":
lambda: bool(int(os.environ.get("VLLM_CUSTOM_CACHE", "1"))),
lambda: bool(int(os.environ.get("VLLM_CUSTOM_CACHE", "0"))),
# flag to control vllm to use optimized kernels
"VLLM_CUSTOM_ALLREDUCE_SUPPORTED_WORLDSIZE_MAX":
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment