Commit b93c3473 authored by zhuwenwen's avatar zhuwenwen
Browse files

解决custom cudagraph模式需要拷贝的问题,需要配合dtk进行使用。

区分pcie和hglink custom allreduce的使用
vllm:export VLLM_CUSTOM_CACHE=1
dtk:export HIP_KERNEL_EVENT_SYSTENFENCE=1
parent 58de8cd6
...@@ -490,6 +490,7 @@ class CustomAllreduce { ...@@ -490,6 +490,7 @@ class CustomAllreduce {
std::map<IPC_KEY, char*> ipc_handles_; std::map<IPC_KEY, char*> ipc_handles_;
uint32_t** dev_curr_hdp_reg; uint32_t** dev_curr_hdp_reg;
hipEvent_t stopEvent;
/** /**
* Signals are an array of ipc-enabled buffers from all ranks. * Signals are an array of ipc-enabled buffers from all ranks.
* For each of the buffer, the layout is as follows: * For each of the buffer, the layout is as follows:
...@@ -518,6 +519,7 @@ class CustomAllreduce { ...@@ -518,6 +519,7 @@ class CustomAllreduce {
hipDeviceGetAttribute((int*)&dev_curr_hdp_reg[i], hipDeviceAttributeHdpMemFlushCntl, i); hipDeviceGetAttribute((int*)&dev_curr_hdp_reg[i], hipDeviceAttributeHdpMemFlushCntl, i);
} }
} }
cudaEventCreate(&stopEvent);
} }
char* open_ipc_handle(const void* ipc_handle) { char* open_ipc_handle(const void* ipc_handle) {
...@@ -739,9 +741,22 @@ class CustomAllreduce { ...@@ -739,9 +741,22 @@ class CustomAllreduce {
} }
} }
// #define KL(ngpus, name) \
// name<T, ngpus><<<blocks, threads, 0, stream>>>(ptrs, sg_, self_sg_, output, \
// rank_, size);
#define KL(ngpus, name) \ #define KL(ngpus, name) \
name<T, ngpus><<<blocks, threads, 0, stream>>>(ptrs, sg_, self_sg_, output, \ { \
rank_, size); void* kernelArgs[] = { \
&ptrs, &sg_, &self_sg_, &output, &rank_, &size \
}; \
hipExtLaunchKernel( \
(void*)name<T, ngpus>, \
blocks, threads, \
kernelArgs, 0, \
stream, nullptr, stopEvent, 0 \
); \
}
#define REDUCE_CASE(ngpus) \ #define REDUCE_CASE(ngpus) \
case ngpus: { \ case ngpus: { \
if (force_1stage) { \ if (force_1stage) { \
...@@ -784,6 +799,7 @@ class CustomAllreduce { ...@@ -784,6 +799,7 @@ class CustomAllreduce {
CUDACHECK(cudaIpcCloseMemHandle(ptr)); CUDACHECK(cudaIpcCloseMemHandle(ptr));
} }
cudaFree(dev_curr_hdp_reg); cudaFree(dev_curr_hdp_reg);
cudaEventDestroy(stopEvent);
} }
}; };
......
...@@ -275,7 +275,13 @@ class CustomAllreduce: ...@@ -275,7 +275,13 @@ class CustomAllreduce:
return None return None
if self._IS_CAPTURING: if self._IS_CAPTURING:
if torch.cuda.is_current_stream_capturing(): if torch.cuda.is_current_stream_capturing():
return self.all_reduce(input, registered=False) if envs.VLLM_CUSTOM_CACHE:
return self.all_reduce(input, registered=True)
else:
if not self.fully_connected:
return self.all_reduce(input, registered=False)
else:
return self.all_reduce(input, registered=True)
else: else:
# If warm up, mimic the allocation pattern since custom # If warm up, mimic the allocation pattern since custom
# allreduce is out-of-place. # allreduce is out-of-place.
......
...@@ -215,6 +215,7 @@ if TYPE_CHECKING: ...@@ -215,6 +215,7 @@ if TYPE_CHECKING:
VLLM_USE_PA_PRINT_PARAM: bool = False VLLM_USE_PA_PRINT_PARAM: bool = False
VLLM_SPEC_DECODE_EAGER: bool = False VLLM_SPEC_DECODE_EAGER: bool = False
VLLM_PCIE_USE_CUSTOM_ALLREDUCE: bool = False VLLM_PCIE_USE_CUSTOM_ALLREDUCE: bool = False
VLLM_CUSTOM_CACHE: bool = False
VLLM_CUSTOM_ALLREDUCE_SUPPORTED_WORLDSIZE_MAX: int = 16 VLLM_CUSTOM_ALLREDUCE_SUPPORTED_WORLDSIZE_MAX: int = 16
VLLM_ENFORCE_EAGER_BS_THRESHOLD: Optional[int] = None VLLM_ENFORCE_EAGER_BS_THRESHOLD: Optional[int] = None
VLLM_HAS_CONTEXT_DEFAULT: bool = False VLLM_HAS_CONTEXT_DEFAULT: bool = False
...@@ -1552,6 +1553,10 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -1552,6 +1553,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_PCIE_USE_CUSTOM_ALLREDUCE": "VLLM_PCIE_USE_CUSTOM_ALLREDUCE":
lambda: bool(int(os.environ.get("VLLM_PCIE_USE_CUSTOM_ALLREDUCE", "1"))), lambda: bool(int(os.environ.get("VLLM_PCIE_USE_CUSTOM_ALLREDUCE", "1"))),
# flag to control vllm to use optimized kernels
"VLLM_CUSTOM_CACHE":
lambda: bool(int(os.environ.get("VLLM_CUSTOM_CACHE", "0"))),
# flag to control vllm to use optimized kernels # flag to control vllm to use optimized kernels
"VLLM_CUSTOM_ALLREDUCE_SUPPORTED_WORLDSIZE_MAX": "VLLM_CUSTOM_ALLREDUCE_SUPPORTED_WORLDSIZE_MAX":
lambda: int(os.getenv("VLLM_CUSTOM_ALLREDUCE_SUPPORTED_WORLDSIZE_MAX", "16")), lambda: int(os.getenv("VLLM_CUSTOM_ALLREDUCE_SUPPORTED_WORLDSIZE_MAX", "16")),
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment