Unverified commit c57a81f0, authored by Frank Lin, committed via GitHub
Browse files

[Paddle] Compile with paddlepaddle-gpu 2.6.1 (#1021)



Fix compilation errors against paddlepaddle-gpu 2.6.1: pass the detected Paddle version to nvcc as -DPADDLE_VERSION and guard APIs that only exist in newer Paddle releases (GetStateIndex, CUDAGraphNodeLauncher) behind that macro, falling back to a direct set_rng_state kernel launch on 2.6.1 and older.
Signed-off-by: Frank Lin (Engrg-Hardware 1) <eee4017@gmail.com>
Co-authored-by: Frank Lin (Engrg-Hardware 1) <fralin@nvidia.com>
Co-authored-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>
parent 8c0a0c93
...@@ -9,6 +9,10 @@ import setuptools ...@@ -9,6 +9,10 @@ import setuptools
from .utils import cuda_version from .utils import cuda_version
import paddle
paddle_version = paddle.__version__.replace(".", "")
def setup_paddle_extension( def setup_paddle_extension(
csrc_source_files, csrc_source_files,
...@@ -45,6 +49,7 @@ def setup_paddle_extension( ...@@ -45,6 +49,7 @@ def setup_paddle_extension(
"-U__CUDA_NO_BFLOAT16_CONVERSIONS__", "-U__CUDA_NO_BFLOAT16_CONVERSIONS__",
"-U__CUDA_NO_BFLOAT162_OPERATORS__", "-U__CUDA_NO_BFLOAT162_OPERATORS__",
"-U__CUDA_NO_BFLOAT162_CONVERSIONS__", "-U__CUDA_NO_BFLOAT162_CONVERSIONS__",
f"-DPADDLE_VERSION={paddle_version}",
"--expt-relaxed-constexpr", "--expt-relaxed-constexpr",
"--expt-extended-lambda", "--expt-extended-lambda",
"--use_fast_math", "--use_fast_math",
......
...@@ -595,10 +595,12 @@ void UpdateRandomGenerator(phi::Place place, cudaStream_t stream, int rng_elts_p ...@@ -595,10 +595,12 @@ void UpdateRandomGenerator(phi::Place place, cudaStream_t stream, int rng_elts_p
// extract random number generator seed and offset // extract random number generator seed and offset
const phi::DeviceContext *dev_ctx = const phi::DeviceContext *dev_ctx =
paddle::experimental::DeviceContextPool::Instance().Get(place); paddle::experimental::DeviceContextPool::Instance().Get(place);
phi::Generator *gen_cuda = dev_ctx->GetGenerator(); phi::Generator *gen_cuda = dev_ctx->GetGenerator();
auto seed_offset = gen_cuda->IncrementOffset(rng_elts_per_thread); auto seed_offset = gen_cuda->IncrementOffset(rng_elts_per_thread);
auto state_index = gen_cuda->GetStateIndex();
int64_t *rng_state_p = static_cast<int64_t *>(rng_state.data()); int64_t *rng_state_p = static_cast<int64_t *>(rng_state.data());
#if PADDLE_VERSION > 261
auto state_index = gen_cuda->GetStateIndex();
auto parameterSetter = [gen_cuda, state_index, auto parameterSetter = [gen_cuda, state_index,
rng_elts_per_thread](phi::backends::gpu::CUDAKernelParams &params) { rng_elts_per_thread](phi::backends::gpu::CUDAKernelParams &params) {
...@@ -618,6 +620,9 @@ void UpdateRandomGenerator(phi::Place place, cudaStream_t stream, int rng_elts_p ...@@ -618,6 +620,9 @@ void UpdateRandomGenerator(phi::Place place, cudaStream_t stream, int rng_elts_p
}; };
phi::backends::gpu::CUDAGraphNodeLauncher::Instance().KernelNodeLaunch(parameterSetter, phi::backends::gpu::CUDAGraphNodeLauncher::Instance().KernelNodeLaunch(parameterSetter,
cudaKernelCallback); cudaKernelCallback);
#else
set_rng_state<<<1, 1, 0, stream>>>(0, seed_offset, rng_state_p);
#endif
} }
void te_fused_attn_fwd_qkvpacked(const paddle::Tensor &QKV, const paddle::Tensor &cu_seqlens, void te_fused_attn_fwd_qkvpacked(const paddle::Tensor &QKV, const paddle::Tensor &cu_seqlens,
...@@ -1005,9 +1010,10 @@ void te_fused_attn_fwd(const paddle::Tensor &Q, const paddle::Tensor &K, const p ...@@ -1005,9 +1010,10 @@ void te_fused_attn_fwd(const paddle::Tensor &Q, const paddle::Tensor &K, const p
auto dev_ctx = paddle::experimental::DeviceContextPool::Instance().Get(Q.place()); auto dev_ctx = paddle::experimental::DeviceContextPool::Instance().Get(Q.place());
auto gen_cuda = dev_ctx->GetGenerator(); auto gen_cuda = dev_ctx->GetGenerator();
auto seed_offset = gen_cuda->IncrementOffset(rng_elts_per_thread); auto seed_offset = gen_cuda->IncrementOffset(rng_elts_per_thread);
auto state_index = gen_cuda->GetStateIndex();
auto rng_state_p = static_cast<int64_t *>(rng_state.data());
auto stream = Q.stream(); auto stream = Q.stream();
auto rng_state_p = static_cast<int64_t *>(rng_state.data());
#if PADDLE_VERSION > 261
auto state_index = gen_cuda->GetStateIndex();
auto parameterSetter = [gen_cuda, state_index, auto parameterSetter = [gen_cuda, state_index,
rng_elts_per_thread](phi::backends::gpu::CUDAKernelParams &params) { rng_elts_per_thread](phi::backends::gpu::CUDAKernelParams &params) {
// ensure the generator use correct state index // ensure the generator use correct state index
...@@ -1026,6 +1032,9 @@ void te_fused_attn_fwd(const paddle::Tensor &Q, const paddle::Tensor &K, const p ...@@ -1026,6 +1032,9 @@ void te_fused_attn_fwd(const paddle::Tensor &Q, const paddle::Tensor &K, const p
}; };
phi::backends::gpu::CUDAGraphNodeLauncher::Instance().KernelNodeLaunch(parameterSetter, phi::backends::gpu::CUDAGraphNodeLauncher::Instance().KernelNodeLaunch(parameterSetter,
cudaKernelCallback); cudaKernelCallback);
#else
set_rng_state<<<1, 1, 0, stream>>>(0, seed_offset, rng_state_p);
#endif
auto te_rng_state = MakeNvteTensor(rng_state); auto te_rng_state = MakeNvteTensor(rng_state);
...@@ -1354,6 +1363,7 @@ void amax_and_scale_update_inplace_legacy(paddle::Tensor &amax_history, // NOLI ...@@ -1354,6 +1363,7 @@ void amax_and_scale_update_inplace_legacy(paddle::Tensor &amax_history, // NOLI
bool update_weight_scale_inv, bool fwd_update, bool update_weight_scale_inv, bool fwd_update,
float fp8_max, float margin, float fp8_max, float margin,
const std::string &amax_compute) { const std::string &amax_compute) {
#if PADDLE_VERSION > 261
NVTE_CHECK(amax_compute == "max" || amax_compute == "most_recent"); NVTE_CHECK(amax_compute == "max" || amax_compute == "most_recent");
paddle::Tensor amax; paddle::Tensor amax;
...@@ -1401,6 +1411,10 @@ void amax_and_scale_update_inplace_legacy(paddle::Tensor &amax_history, // NOLI ...@@ -1401,6 +1411,10 @@ void amax_and_scale_update_inplace_legacy(paddle::Tensor &amax_history, // NOLI
}; };
phi::backends::gpu::CUDAGraphNodeLauncher::Instance().KernelNodeLaunch(parameterSetter, phi::backends::gpu::CUDAGraphNodeLauncher::Instance().KernelNodeLaunch(parameterSetter,
cudaKernelCallback); cudaKernelCallback);
#else
NVTE_ERROR(
"amax_and_scale_update_inplace_legacy is not supported in old version of PaddlePaddle\n");
#endif
} }
void update_latest_amax_history_inplace(paddle::Tensor &history, // NOLINT void update_latest_amax_history_inplace(paddle::Tensor &history, // NOLINT
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment