Commit 06106338 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge branch 'v0.9.2-dev-custom' into 'v0.9.2-dev'

解决custom cudagraph模式需要拷贝的问题,这个需要配合dtk进行使用。

See merge request dcutoolkit/deeplearing/vllm!306
parents afdabfbe 651925e8
...@@ -902,6 +902,7 @@ class CustomAllreduce { ...@@ -902,6 +902,7 @@ class CustomAllreduce {
std::map<IPC_KEY, char*> ipc_handles_; std::map<IPC_KEY, char*> ipc_handles_;
uint32_t** dev_curr_hdp_reg; uint32_t** dev_curr_hdp_reg;
hipEvent_t stopEvent;
/** /**
* Signals are an array of ipc-enabled buffers from all ranks. * Signals are an array of ipc-enabled buffers from all ranks.
* For each of the buffer, the layout is as follows: * For each of the buffer, the layout is as follows:
...@@ -930,6 +931,7 @@ class CustomAllreduce { ...@@ -930,6 +931,7 @@ class CustomAllreduce {
hipDeviceGetAttribute((int*)&dev_curr_hdp_reg[i], hipDeviceAttributeHdpMemFlushCntl, i); hipDeviceGetAttribute((int*)&dev_curr_hdp_reg[i], hipDeviceAttributeHdpMemFlushCntl, i);
} }
} }
cudaEventCreate(&stopEvent);
} }
char* open_ipc_handle(const void* ipc_handle) { char* open_ipc_handle(const void* ipc_handle) {
...@@ -1303,9 +1305,22 @@ class CustomAllreduce { ...@@ -1303,9 +1305,22 @@ class CustomAllreduce {
size /= d; size /= d;
auto bytes = size * sizeof(typename packed_t<T>::P); auto bytes = size * sizeof(typename packed_t<T>::P);
int blocks = std::min(block_limit, (size + threads - 1) / threads); int blocks = std::min(block_limit, (size + threads - 1) / threads);
// #define KL(ngpus, name) \
// name<T, ngpus><<<blocks, threads, 0, stream>>>(ptrs, sg_, self_sg_, output, \
// rank_, size);
#define KL(ngpus, name) \ #define KL(ngpus, name) \
name<T, ngpus><<<blocks, threads, 0, stream>>>(ptrs, sg_, self_sg_, output, \ { \
rank_, size); void* kernelArgs[] = { \
&ptrs, &sg_, &self_sg_, &output, &rank_, &size \
}; \
hipExtLaunchKernel( \
(void*)name<T, ngpus>, \
blocks, threads, \
kernelArgs, 0, \
stream, nullptr, stopEvent, 0 \
); \
}
#define REDUCE_CASE(ngpus) \ #define REDUCE_CASE(ngpus) \
case ngpus: { \ case ngpus: { \
if (world_size_ == 2) { \ if (world_size_ == 2) { \
...@@ -1342,6 +1357,7 @@ class CustomAllreduce { ...@@ -1342,6 +1357,7 @@ class CustomAllreduce {
CUDACHECK(cudaIpcCloseMemHandle(ptr)); CUDACHECK(cudaIpcCloseMemHandle(ptr));
} }
cudaFree(dev_curr_hdp_reg); cudaFree(dev_curr_hdp_reg);
cudaEventDestroy(stopEvent);
} }
}; };
......
...@@ -264,7 +264,10 @@ class CustomAllreduce: ...@@ -264,7 +264,10 @@ class CustomAllreduce:
return None return None
if self._IS_CAPTURING: if self._IS_CAPTURING:
if torch.cuda.is_current_stream_capturing(): if torch.cuda.is_current_stream_capturing():
return self.all_reduce(input, registered=False) if not envs.VLLM_CUSTOM_CACHE:
return self.all_reduce(input, registered=False)
else:
return self.all_reduce(input, registered=True)
else: else:
# If warm up, mimic the allocation pattern since custom # If warm up, mimic the allocation pattern since custom
# allreduce is out-of-place. # allreduce is out-of-place.
......
...@@ -152,6 +152,7 @@ if TYPE_CHECKING: ...@@ -152,6 +152,7 @@ if TYPE_CHECKING:
VLLM_USE_PA_PRINT_PARAM: bool = False VLLM_USE_PA_PRINT_PARAM: bool = False
VLLM_SPEC_DECODE_EAGER: bool = False VLLM_SPEC_DECODE_EAGER: bool = False
VLLM_PCIE_USE_CUSTOM_ALLREDUCE: bool = False VLLM_PCIE_USE_CUSTOM_ALLREDUCE: bool = False
VLLM_CUSTOM_CACHE: bool = False
VLLM_CUSTOM_ALLREDUCE_SUPPORTED_WORLDSIZE_MAX: int = 16 VLLM_CUSTOM_ALLREDUCE_SUPPORTED_WORLDSIZE_MAX: int = 16
VLLM_ENFORCE_EAGER_BS_THRESHOLD: Optional[int] = None VLLM_ENFORCE_EAGER_BS_THRESHOLD: Optional[int] = None
VLLM_HAS_CONTEXT_DEFAULT: bool = False VLLM_HAS_CONTEXT_DEFAULT: bool = False
...@@ -1065,6 +1066,10 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -1065,6 +1066,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
# flag to control vllm to use optimized kernels # flag to control vllm to use optimized kernels
"VLLM_PCIE_USE_CUSTOM_ALLREDUCE": "VLLM_PCIE_USE_CUSTOM_ALLREDUCE":
lambda: bool(int(os.environ.get("VLLM_PCIE_USE_CUSTOM_ALLREDUCE", "0"))), lambda: bool(int(os.environ.get("VLLM_PCIE_USE_CUSTOM_ALLREDUCE", "0"))),
# flag to control vllm to use optimized kernels
"VLLM_CUSTOM_CACHE":
lambda: bool(int(os.environ.get("VLLM_CUSTOM_CACHE", "0"))),
# flag to control vllm to use optimized kernels # flag to control vllm to use optimized kernels
"VLLM_CUSTOM_ALLREDUCE_SUPPORTED_WORLDSIZE_MAX": "VLLM_CUSTOM_ALLREDUCE_SUPPORTED_WORLDSIZE_MAX":
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment