Unverified commit 13bd745b, authored by vasunvidia and committed by GitHub
Browse files

Remove cudaStreamSynchronize call from transformer_engine.cpp (#1518)



* Remove cudaStreamSynchronize call
Signed-off-by: Vasudevan Rengasamy <vrengasamy@nvidia.com>

* Use cudaMemsetAsync instead of cudaMemcpyAsync
Signed-off-by: Vasudevan Rengasamy <vrengasamy@nvidia.com>

* Update transformer_engine/common/transformer_engine.cpp
Co-authored-by: Tim Moon <4406448+timmoon10@users.noreply.github.com>
Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>

---------
Signed-off-by: Vasudevan Rengasamy <vrengasamy@nvidia.com>
Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>
Co-authored-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>
Co-authored-by: Tim Moon <4406448+timmoon10@users.noreply.github.com>
parent 97100139
......@@ -407,8 +407,6 @@ void nvte_zero_tensor(const NVTETensor tensor, cudaStream_t stream) {
}
// Set amax to 0 if allocated
if (t.amax.dptr != nullptr) {
float zero = 0.0f;
cudaMemcpyAsync(t.amax.dptr, &zero, sizeof(float), cudaMemcpyHostToDevice, stream);
cudaMemsetAsync(t.amax.dptr, 0, sizeof(float), stream);
}
cudaStreamSynchronize(stream);
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment