"vscode:/vscode.git/clone" did not exist on "a75da0ca15000f786d908a4d285f9e67edc3555c"
Unverified commit e3d2efd7, authored by Rachit Garg, committed by GitHub

add external margin (#713)

Add env var for SM margin in GEMM
Signed-off-by: Rachit Garg <rachitg@nvidia.com>
Co-authored-by: Rachit Garg <rachitg@nvidia.com>
parent a38b291b
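The whole change is driven by one environment variable, NVTE_EXT_MARGIN_SM, read when the overlap objects are constructed and when te_gemm_ts runs. In practice the variable is exported in the job's launch environment; purely as a hypothetical illustration in C++ (the setenv call and the margin value 4 are not part of this commit):

// Hypothetical usage sketch, not part of this commit: reserve 4 SMs
// for overlapped work before Transformer Engine reads the variable.
#include <cstdlib>

int main() {
  // POSIX setenv; a real job would export this in the launch script
  // before the process starts.
  setenv("NVTE_EXT_MARGIN_SM", "4", /*overwrite=*/1);
  // ... initialize Transformer Engine and run GEMMs; they now use
  // multiProcessorCount - 4 SMs for math.
  return 0;
}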
@@ -18,6 +18,7 @@
 #include <torch/types.h>
 #include "common/util/logging.h"
+#include "common/util/system.h"
 #include "userbuffers/userbuffers.h"
 #define HALF_BYTES 2
@@ -112,6 +113,7 @@ struct UbufCommOverlap : torch::CustomClassHolder, UbufBase {
     cudaDeviceProp prop;
     cudaGetDeviceProperties(&prop, 0);
     _math_sms = (set_sm_margin) ? prop.multiProcessorCount - num_comm_sm : prop.multiProcessorCount;
+    _math_sms -= transformer_engine::getenv<int>("NVTE_EXT_MARGIN_SM", 0);
     output_tensor = torch::Tensor();
     auto counter_options = torch::TensorOptions().dtype(torch::kInt32).device(torch::kCUDA);
@@ -587,6 +589,7 @@ struct UbufP2PCommOverlap : torch::CustomClassHolder, UbufBase {
     cudaDeviceProp prop;
     cudaGetDeviceProperties(&prop, 0);
     _math_sms = (set_sm_margin) ? prop.multiProcessorCount - num_comm_sm : prop.multiProcessorCount;
+    _math_sms -= transformer_engine::getenv<int>("NVTE_EXT_MARGIN_SM", 0);
     _tp_size = tp_size;
     _aggregate2 = aggregate2;
...
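Both constructors now subtract the externally requested margin via transformer_engine::getenv<int>, which the newly included common/util/system.h provides. That header's implementation is not part of this diff; a minimal sketch of how such a helper typically behaves (the parsing details below are assumptions, not the library's actual code):

// Minimal sketch, assuming typical behavior; the real helper lives in
// common/util/system.h and is not shown in this diff. Reads an integer
// environment variable, falling back to a default when it is unset.
#include <cstdlib>
#include <string>

namespace transformer_engine {

template <typename T>
T getenv(const char *name, const T &default_value) {
  const char *value = std::getenv(name);
  if (value == nullptr || value[0] == '\0') {
    return default_value;  // variable unset or empty
  }
  return static_cast<T>(std::stoll(value));  // integer types only
}

}  // namespace transformer_engine

With T = int and a default of 0, an unset NVTE_EXT_MARGIN_SM leaves _math_sms unchanged, preserving the old behavior.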
@@ -6,7 +6,9 @@
 #include <torch/script.h>
 #include "extensions.h"
+#include <cuda.h>
+#include <cuda_fp8.h>
+#include "common/util/system.h"
 namespace {
 transformer_engine::DType reverse_map_dtype(int64_t dtype) {
@@ -316,6 +318,13 @@ at::Tensor te_gemm_ts(at::Tensor A,
   bool accumulate_arg = static_cast<bool>(accumulate);
   bool use_split_accumulator_arg = static_cast<bool>(use_split_accumulator);
+  // Set an external SM margin for all the GEMMs.
+  // This comes in handy when data-parallel (DP) communication is overlapped with GEMMs.
+  cudaDeviceProp prop;
+  cudaGetDeviceProperties(&prop, 0);
+  int num_math_sms = prop.multiProcessorCount
+                     - transformer_engine::getenv<int>("NVTE_EXT_MARGIN_SM", 0);
   if (A_scale_inverse.numel())
     A_scale_inverse = A_scale_inverse[A_fp8_tensor];
@@ -342,7 +351,7 @@ at::Tensor te_gemm_ts(at::Tensor A,
                        workspaceSize_arg,
                        accumulate_arg,
                        use_split_accumulator_arg,
-                       0);
+                       num_math_sms);
   return D;
 }
...
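The net effect in te_gemm_ts is that the previously hardcoded 0 (all SMs available to the GEMM) becomes the device's SM count minus the external margin. A small hypothetical standalone check, not part of the commit, that mirrors this arithmetic so the budget can be verified on a given GPU:

// Hypothetical check reproducing the arithmetic added in te_gemm_ts:
// print how many SMs remain for math after NVTE_EXT_MARGIN_SM applies.
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

int main() {
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, 0);

  const char *env = std::getenv("NVTE_EXT_MARGIN_SM");
  const int margin = (env != nullptr) ? std::atoi(env) : 0;

  // Same formula as the patched code: all SMs minus the external margin.
  const int num_math_sms = prop.multiProcessorCount - margin;
  std::printf("total SMs %d, external margin %d, math SMs %d\n",
              prop.multiProcessorCount, margin, num_math_sms);
  return 0;
}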