Unverified commit c57a81f0, authored by Frank Lin, committed via GitHub
Browse files

[Paddle] Compile with paddlepaddle-gpu 2.6.1 (#1021)



Fix compilation errors against paddlepaddle-gpu 2.6.1: pass the detected Paddle version to nvcc as -DPADDLE_VERSION and guard APIs that only exist in newer Paddle releases (GetStateIndex, CUDAGraphNodeLauncher) behind that macro, falling back to a direct set_rng_state kernel launch on 2.6.1 and older.
Signed-off-by: Frank Lin (Engrg-Hardware 1) <eee4017@gmail.com>
Co-authored-by: Frank Lin (Engrg-Hardware 1) <fralin@nvidia.com>
Co-authored-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>
parent 8c0a0c93
...@@ -9,6 +9,10 @@ import setuptools ...@@ -9,6 +9,10 @@ import setuptools
from .utils import cuda_version from .utils import cuda_version
import paddle
paddle_version = paddle.__version__.replace(".", "")
def setup_paddle_extension( def setup_paddle_extension(
csrc_source_files, csrc_source_files,
...@@ -45,6 +49,7 @@ def setup_paddle_extension( ...@@ -45,6 +49,7 @@ def setup_paddle_extension(
"-U__CUDA_NO_BFLOAT16_CONVERSIONS__", "-U__CUDA_NO_BFLOAT16_CONVERSIONS__",
"-U__CUDA_NO_BFLOAT162_OPERATORS__", "-U__CUDA_NO_BFLOAT162_OPERATORS__",
"-U__CUDA_NO_BFLOAT162_CONVERSIONS__", "-U__CUDA_NO_BFLOAT162_CONVERSIONS__",
f"-DPADDLE_VERSION={paddle_version}",
"--expt-relaxed-constexpr", "--expt-relaxed-constexpr",
"--expt-extended-lambda", "--expt-extended-lambda",
"--use_fast_math", "--use_fast_math",
......
...@@ -595,10 +595,12 @@ void UpdateRandomGenerator(phi::Place place, cudaStream_t stream, int rng_elts_p ...@@ -595,10 +595,12 @@ void UpdateRandomGenerator(phi::Place place, cudaStream_t stream, int rng_elts_p
// extract random number generator seed and offset // extract random number generator seed and offset
const phi::DeviceContext *dev_ctx = const phi::DeviceContext *dev_ctx =
paddle::experimental::DeviceContextPool::Instance().Get(place); paddle::experimental::DeviceContextPool::Instance().Get(place);
phi::Generator *gen_cuda = dev_ctx->GetGenerator(); phi::Generator *gen_cuda = dev_ctx->GetGenerator();
auto seed_offset = gen_cuda->IncrementOffset(rng_elts_per_thread); auto seed_offset = gen_cuda->IncrementOffset(rng_elts_per_thread);
auto state_index = gen_cuda->GetStateIndex();
int64_t *rng_state_p = static_cast<int64_t *>(rng_state.data()); int64_t *rng_state_p = static_cast<int64_t *>(rng_state.data());
#if PADDLE_VERSION > 261
auto state_index = gen_cuda->GetStateIndex();
auto parameterSetter = [gen_cuda, state_index, auto parameterSetter = [gen_cuda, state_index,
rng_elts_per_thread](phi::backends::gpu::CUDAKernelParams &params) { rng_elts_per_thread](phi::backends::gpu::CUDAKernelParams &params) {
...@@ -618,6 +620,9 @@ void UpdateRandomGenerator(phi::Place place, cudaStream_t stream, int rng_elts_p ...@@ -618,6 +620,9 @@ void UpdateRandomGenerator(phi::Place place, cudaStream_t stream, int rng_elts_p
}; };
phi::backends::gpu::CUDAGraphNodeLauncher::Instance().KernelNodeLaunch(parameterSetter, phi::backends::gpu::CUDAGraphNodeLauncher::Instance().KernelNodeLaunch(parameterSetter,
cudaKernelCallback); cudaKernelCallback);
#else
set_rng_state<<<1, 1, 0, stream>>>(0, seed_offset, rng_state_p);
#endif
} }
void te_fused_attn_fwd_qkvpacked(const paddle::Tensor &QKV, const paddle::Tensor &cu_seqlens, void te_fused_attn_fwd_qkvpacked(const paddle::Tensor &QKV, const paddle::Tensor &cu_seqlens,
...@@ -1005,9 +1010,10 @@ void te_fused_attn_fwd(const paddle::Tensor &Q, const paddle::Tensor &K, const p ...@@ -1005,9 +1010,10 @@ void te_fused_attn_fwd(const paddle::Tensor &Q, const paddle::Tensor &K, const p
auto dev_ctx = paddle::experimental::DeviceContextPool::Instance().Get(Q.place()); auto dev_ctx = paddle::experimental::DeviceContextPool::Instance().Get(Q.place());
auto gen_cuda = dev_ctx->GetGenerator(); auto gen_cuda = dev_ctx->GetGenerator();
auto seed_offset = gen_cuda->IncrementOffset(rng_elts_per_thread); auto seed_offset = gen_cuda->IncrementOffset(rng_elts_per_thread);
auto state_index = gen_cuda->GetStateIndex();
auto rng_state_p = static_cast<int64_t *>(rng_state.data());
auto stream = Q.stream(); auto stream = Q.stream();
auto rng_state_p = static_cast<int64_t *>(rng_state.data());
#if PADDLE_VERSION > 261
auto state_index = gen_cuda->GetStateIndex();
auto parameterSetter = [gen_cuda, state_index, auto parameterSetter = [gen_cuda, state_index,
rng_elts_per_thread](phi::backends::gpu::CUDAKernelParams &params) { rng_elts_per_thread](phi::backends::gpu::CUDAKernelParams &params) {
// ensure the generator use correct state index // ensure the generator use correct state index
...@@ -1026,6 +1032,9 @@ void te_fused_attn_fwd(const paddle::Tensor &Q, const paddle::Tensor &K, const p ...@@ -1026,6 +1032,9 @@ void te_fused_attn_fwd(const paddle::Tensor &Q, const paddle::Tensor &K, const p
}; };
phi::backends::gpu::CUDAGraphNodeLauncher::Instance().KernelNodeLaunch(parameterSetter, phi::backends::gpu::CUDAGraphNodeLauncher::Instance().KernelNodeLaunch(parameterSetter,
cudaKernelCallback); cudaKernelCallback);
#else
set_rng_state<<<1, 1, 0, stream>>>(0, seed_offset, rng_state_p);
#endif
auto te_rng_state = MakeNvteTensor(rng_state); auto te_rng_state = MakeNvteTensor(rng_state);
...@@ -1354,6 +1363,7 @@ void amax_and_scale_update_inplace_legacy(paddle::Tensor &amax_history, // NOLI ...@@ -1354,6 +1363,7 @@ void amax_and_scale_update_inplace_legacy(paddle::Tensor &amax_history, // NOLI
bool update_weight_scale_inv, bool fwd_update, bool update_weight_scale_inv, bool fwd_update,
float fp8_max, float margin, float fp8_max, float margin,
const std::string &amax_compute) { const std::string &amax_compute) {
#if PADDLE_VERSION > 261
NVTE_CHECK(amax_compute == "max" || amax_compute == "most_recent"); NVTE_CHECK(amax_compute == "max" || amax_compute == "most_recent");
paddle::Tensor amax; paddle::Tensor amax;
...@@ -1401,6 +1411,10 @@ void amax_and_scale_update_inplace_legacy(paddle::Tensor &amax_history, // NOLI ...@@ -1401,6 +1411,10 @@ void amax_and_scale_update_inplace_legacy(paddle::Tensor &amax_history, // NOLI
}; };
phi::backends::gpu::CUDAGraphNodeLauncher::Instance().KernelNodeLaunch(parameterSetter, phi::backends::gpu::CUDAGraphNodeLauncher::Instance().KernelNodeLaunch(parameterSetter,
cudaKernelCallback); cudaKernelCallback);
#else
NVTE_ERROR(
"amax_and_scale_update_inplace_legacy is not supported in old version of PaddlePaddle\n");
#endif
} }
void update_latest_amax_history_inplace(paddle::Tensor &history, // NOLINT void update_latest_amax_history_inplace(paddle::Tensor &history, // NOLINT
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment