Commit 2b05e121 authored by yuguo

Merge commit 'a69692ac' of https://github.com/NVIDIA/TransformerEngine
parents 0fd441c2 a69692ac
@@ -97,6 +97,8 @@ def _measure_memory_between_forward_and_backward(models, fp8_recipe, cpu_offload
+max_mem_used = torch.cuda.memory_allocated() / (1024**2)
+torch.cuda.synchronize()
tensor.sum().backward()
return max_mem_used
@@ -115,6 +117,9 @@ def test_cpu_offload(fp8_recipe, model_key) -> None:
the difference being the size of the FP8 cache that is not offloaded to the CPU.
We also expect this memory consumption to be smaller than in scenario (1).
"""
+import gc
+gc.collect()
model_cls = model_types[model_key]
models_list = [model_cls() for _ in range(NUM_LAYERS)]
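Note on the hunk above: the test records `torch.cuda.memory_allocated()` between the forward and backward pass, which captures the parameters plus whatever activations were kept resident for backward (minus anything offloaded to the CPU). A minimal sketch of that idiom, assuming a hypothetical `Linear` stack standing in for the `model_types` layers the real test instantiates:

```python
# Minimal sketch of the measurement idiom used by the test above;
# the Linear stack and tensor shapes are illustrative stand-ins,
# not the actual Transformer Engine layers under test.
import torch

def measure_mem_between_fwd_and_bwd(model: torch.nn.Module,
                                    inp: torch.Tensor) -> float:
    out = model(inp)
    torch.cuda.synchronize()  # ensure all forward kernels have finished
    # Allocated memory here = parameters + activations saved for backward
    # (minus anything already offloaded to CPU).
    mem_mib = torch.cuda.memory_allocated() / (1024**2)
    out.sum().backward()
    return mem_mib

model = torch.nn.Sequential(*[torch.nn.Linear(1024, 1024) for _ in range(8)]).cuda()
x = torch.randn(64, 1024, device="cuda", requires_grad=True)
print(f"{measure_mem_between_fwd_and_bwd(model, x):.1f} MiB held before backward")
```

Running this twice, with and without activation offloading, yields the two numbers that the docstring's scenarios compare.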
@@ -111,6 +111,7 @@ struct communicator {
CUmemGenericAllocationHandle *uchandles[NVTE_MAX_REGIONS];
#endif
void *ucbase_ptr[NVTE_MAX_REGIONS]; // only for cuMem allocated memory
+size_t uc_offsets[NVTE_MAX_REGIONS];
size_t mem_size[NVTE_MAX_REGIONS];
bool mem_dealloc[NVTE_MAX_REGIONS];
@@ -133,7 +134,7 @@ struct communicator {
// max value for running block counters in hostflags
int basecounter[userbuffers_op_types]; // NOLINT(*)
-int *flags, *map_flags;
+int *flags_baseptr, *flags, *map_flags;
void *mem_mr[NVTE_MAX_REGIONS];
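Context on these fields: `uchandles`, `ucbase_ptr`, and the added `uc_offsets` all describe memory obtained through the CUDA virtual memory management (cuMem) driver API, where the physical allocation handle and the virtual address it is mapped at are created separately, and both are needed again at tear-down; `uc_offsets` presumably records each region's offset within its mapped range. A rough sketch of that allocation flow, assuming the standard `cuMemCreate`/`cuMemMap` sequence and an already-initialized CUDA context (names and structure below are illustrative, not the userbuffers implementation; error checks omitted):

```cpp
// Illustrative cuMem allocation flow, mirroring why the communicator
// keeps a handle, base pointer, and size per region.
#include <cuda.h>

struct Region {
  CUmemGenericAllocationHandle handle;  // physical allocation (cf. uchandles)
  CUdeviceptr base;                     // mapped virtual address (cf. ucbase_ptr)
  size_t size;                          // bytes reserved/mapped (cf. mem_size)
};

static Region alloc_region(int device, size_t bytes) {
  CUmemAllocationProp prop = {};
  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  prop.location.id = device;

  size_t gran = 0;
  cuMemGetAllocationGranularity(&gran, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
  size_t size = ((bytes + gran - 1) / gran) * gran;  // round up to granularity

  Region r = {};
  r.size = size;
  cuMemCreate(&r.handle, size, &prop, 0);          // physical memory
  cuMemAddressReserve(&r.base, size, gran, 0, 0);  // virtual address range
  cuMemMap(r.base, size, 0, r.handle, 0);          // bind them together

  CUmemAccessDesc access = {};
  access.location = prop.location;
  access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
  cuMemSetAccess(r.base, size, &access, 1);        // enable device access
  return r;
}

static void free_region(Region &r) {
  // Tear-down needs both the base pointer and the handle, hence the
  // per-region bookkeeping arrays in the communicator struct.
  cuMemUnmap(r.base, r.size);
  cuMemAddressFree(r.base, r.size);
  cuMemRelease(r.handle);
}
```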