Commit 2b05e121 authored by yuguo

Merge commit 'a69692ac' of https://github.com/NVIDIA/TransformerEngine
parents 0fd441c2 a69692ac
@@ -97,6 +97,8 @@ def _measure_memory_between_forward_and_backward(models, fp8_recipe, cpu_offload
+max_mem_used = torch.cuda.memory_allocated() / (1024**2)
+torch.cuda.synchronize()
tensor.sum().backward()
return max_mem_used
@@ -115,6 +117,9 @@ def test_cpu_offload(fp8_recipe, model_key) -> None:
the difference being the size of the FP8 cache that is not offloaded to the CPU.
We also expect this memory consumption to be smaller than in scenario (1).
"""
+import gc
+gc.collect()
model_cls = model_types[model_key]
models_list = [model_cls() for _ in range(NUM_LAYERS)]
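Note on the hunk above: the test records `torch.cuda.memory_allocated()` between the forward and backward pass, which captures the parameters plus whatever activations were kept resident for backward (minus anything offloaded to the CPU). A minimal sketch of that idiom, assuming a hypothetical `Linear` stack standing in for the `model_types` layers the real test instantiates:

```python
# Minimal sketch of the measurement idiom used by the test above;
# the Linear stack and tensor shapes are illustrative stand-ins,
# not the actual Transformer Engine layers under test.
import torch

def measure_mem_between_fwd_and_bwd(model: torch.nn.Module,
                                    inp: torch.Tensor) -> float:
    out = model(inp)
    torch.cuda.synchronize()  # ensure all forward kernels have finished
    # Allocated memory here = parameters + activations saved for backward
    # (minus anything already offloaded to CPU).
    mem_mib = torch.cuda.memory_allocated() / (1024**2)
    out.sum().backward()
    return mem_mib

model = torch.nn.Sequential(*[torch.nn.Linear(1024, 1024) for _ in range(8)]).cuda()
x = torch.randn(64, 1024, device="cuda", requires_grad=True)
print(f"{measure_mem_between_fwd_and_bwd(model, x):.1f} MiB held before backward")
```

Running this twice, with and without activation offloading, yields the two numbers that the docstring's scenarios compare.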
@@ -111,6 +111,7 @@ struct communicator {
CUmemGenericAllocationHandle *uchandles[NVTE_MAX_REGIONS];
#endif
void *ucbase_ptr[NVTE_MAX_REGIONS]; // only for cuMem allocated memory
+size_t uc_offsets[NVTE_MAX_REGIONS];
size_t mem_size[NVTE_MAX_REGIONS];
bool mem_dealloc[NVTE_MAX_REGIONS];
@@ -133,7 +134,7 @@ struct communicator {
// max value for running block counters in hostflags
int basecounter[userbuffers_op_types]; // NOLINT(*)
-int *flags, *map_flags;
+int *flags_baseptr, *flags, *map_flags;
void *mem_mr[NVTE_MAX_REGIONS];
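Context on these fields: `uchandles`, `ucbase_ptr`, and the added `uc_offsets` all describe memory obtained through the CUDA virtual memory management (cuMem) driver API, where the physical allocation handle and the virtual address it is mapped at are created separately, and both are needed again at tear-down; `uc_offsets` presumably records each region's offset within its mapped range. A rough sketch of that allocation flow, assuming the standard `cuMemCreate`/`cuMemMap` sequence and an already-initialized CUDA context (names and structure below are illustrative, not the userbuffers implementation; error checks omitted):

```cpp
// Illustrative cuMem allocation flow, mirroring why the communicator
// keeps a handle, base pointer, and size per region.
#include <cuda.h>

struct Region {
  CUmemGenericAllocationHandle handle;  // physical allocation (cf. uchandles)
  CUdeviceptr base;                     // mapped virtual address (cf. ucbase_ptr)
  size_t size;                          // bytes reserved/mapped (cf. mem_size)
};

static Region alloc_region(int device, size_t bytes) {
  CUmemAllocationProp prop = {};
  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  prop.location.id = device;

  size_t gran = 0;
  cuMemGetAllocationGranularity(&gran, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
  size_t size = ((bytes + gran - 1) / gran) * gran;  // round up to granularity

  Region r = {};
  r.size = size;
  cuMemCreate(&r.handle, size, &prop, 0);          // physical memory
  cuMemAddressReserve(&r.base, size, gran, 0, 0);  // virtual address range
  cuMemMap(r.base, size, 0, r.handle, 0);          // bind them together

  CUmemAccessDesc access = {};
  access.location = prop.location;
  access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
  cuMemSetAccess(r.base, size, &access, 1);        // enable device access
  return r;
}

static void free_region(Region &r) {
  // Tear-down needs both the base pointer and the handle, hence the
  // per-region bookkeeping arrays in the communicator struct.
  cuMemUnmap(r.base, r.size);
  cuMemAddressFree(r.base, r.size);
  cuMemRelease(r.handle);
}
```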