"...git@developer.sourcefind.cn:kecinstone/2024-pra-vllm.git" did not exist on "cf21a9bd5cd29b8b52a8dfceac22798ef648e6bc"
Commit f8c2af4c authored by yuguo's avatar yuguo
Browse files

Merge commit '1d903f5e' of...

Merge commit '1d903f5e' of https://github.com/NVIDIA/TransformerEngine
parents e92773a3 1d903f5e
......@@ -17,7 +17,7 @@ fi
# Download Megatron-LM if needed
if [ ! -d "${MCORE_PATH}" ]; then
pushd $(dirname ${MCORE_PATH})
git clone -b core_r0.9.0 https://github.com/NVIDIA/Megatron-LM.git Megatron-LM
git clone -b core_r0.12.0 https://github.com/NVIDIA/Megatron-LM.git Megatron-LM
popd
fi
......
......@@ -20,6 +20,7 @@ FAILED_CASES=""
pip3 install "nltk>=3.8.2" || error_exit "Failed to install nltk"
pip3 install pytest==8.2.1 || error_exit "Failed to install pytest"
: ${TE_PATH:=/opt/transformerengine}
: ${XML_LOG_DIR:=/logs}
mkdir -p "$XML_LOG_DIR"
......@@ -30,10 +31,9 @@ python3 -m pytest -c $TE_PATH/tests/jax/pytest.ini -v --junitxml=$XML_LOG_DIR/py
NVTE_JAX_UNITTEST_LEVEL="L2" NVTE_CUSTOM_CALLS_RE="" python3 -m pytest -c $TE_PATH/tests/jax/pytest.ini -v --junitxml=$XML_LOG_DIR/pytest_test_custom_call_compute.xml $TE_PATH/tests/jax/test_custom_call_compute.py || test_fail "test_custom_call_compute.py"
pip3 install -r $TE_PATH/examples/jax/mnist/requirements.txt || error_exit "Failed to install mnist requirements"
pip3 install -r $TE_PATH/examples/jax/encoder/requirements.txt || error_exit "Failed to install encoder requirements"
python3 -m pytest -c $TE_PATH/tests/jax/pytest.ini -v --junitxml=$XML_LOG_DIR/pytest_mnist.xml $TE_PATH/examples/jax/mnist || test_fail "mnist"
pip3 install -r $TE_PATH/examples/jax/encoder/requirements.txt || error_exit "Failed to install encoder requirements"
# Make encoder tests to have run-to-run deterministic to have the stable CI results
export XLA_FLAGS="${XLA_FLAGS} --xla_gpu_deterministic_ops"
python3 -m pytest -c $TE_PATH/tests/jax/pytest.ini -v --junitxml=$XML_LOG_DIR/pytest_test_single_gpu_encoder.xml $TE_PATH/examples/jax/encoder/test_single_gpu_encoder.py || test_fail "test_single_gpu_encoder.py"
......
......@@ -11,15 +11,17 @@ mkdir -p "$XML_LOG_DIR"
pip3 install pytest==8.2.1
# Limit parallel build jobs to avoid overwhelming system resources
export MAX_JOBS=4
export MAX_JOBS=32
# Iterate over Flash Attention versions
sm_arch=`python3 -c "import torch; sm = torch.cuda.get_device_capability(0); print(sm[0]*10+sm[1])"`
export FLASH_ATTN_CUDA_ARCHS=$sm_arch
if [ $sm_arch -gt 90 ]
then
FA_versions=(2.7.3)
else
FA_versions=(2.3.0 2.4.1 2.5.7 2.7.3 3.0.0b1)
elif [ $sm_arch -eq 90 ]
then
FA_versions=(2.5.7 2.7.3 3.0.0b1)
fi
for fa_version in "${FA_versions[@]}"
......
......@@ -7,7 +7,6 @@
# NVTE_FRAMEWORK=pytorch NVTE_USE_ROCM=1 NVTE_USE_HIPBLASLT=1 NVTE_USE_ROCBLAS=1 CMAKE_PREFIX_PATH=/opt/dtk/lib/cmake/amd_comgr/ MPI_HOME=/opt/mpi/ NVTE_UB_WITH_MPI=1 CXX=hipcc PYTHONPATH=/home/TransformerEngine/3rdparty/hipify_torch:$PYTHONPATH python3 setup.py bdist_wheel
import os
import sys
import time
from pathlib import Path
from typing import List, Tuple
......@@ -26,7 +25,6 @@ from build_tools.utils import (
get_frameworks,
install_and_import,
remove_dups,
uninstall_te_wheel_packages,
)
frameworks = get_frameworks()
......@@ -111,7 +109,15 @@ def setup_requirements() -> Tuple[List[str], List[str], List[str]]:
"""
# Common requirements
setup_reqs: List[str] = []
setup_reqs: List[str] = [
"nvidia-cuda-runtime-cu12",
"nvidia-cublas-cu12",
"nvidia-cudnn-cu12",
"nvidia-cuda-cccl-cu12",
"nvidia-cuda-nvcc-cu12",
"nvidia-nvtx-cu12",
"nvidia-cuda-nvrtc-cu12",
]
install_reqs: List[str] = [
"pydantic",
"importlib-metadata>=1.0",
......@@ -130,6 +136,7 @@ def setup_requirements() -> Tuple[List[str], List[str], List[str]]:
# Framework-specific requirements
if not bool(int(os.getenv("NVTE_RELEASE_BUILD", "0"))):
if "pytorch" in frameworks:
setup_reqs.extend(["torch>=2.1"])
install_reqs.extend(["torch>=2.1"])
# install_reqs.append(
# "nvdlfw-inspect @"
......@@ -137,8 +144,9 @@ def setup_requirements() -> Tuple[List[str], List[str], List[str]]:
# )
# Blackwell is not supported as of Triton 3.2.0, need custom internal build
# install_reqs.append("triton")
test_reqs.extend(["numpy", "torchvision", "prettytable", "PyYAML"])
test_reqs.extend(["numpy", "torchvision"])
if "jax" in frameworks:
setup_reqs.extend(["jax[cuda12]", "flax>=0.7.1"])
install_reqs.extend(["jax", "flax>=0.7.1"])
test_reqs.extend(["numpy"])
......@@ -157,7 +165,6 @@ if __name__ == "__main__":
int(os.getenv("NVTE_RELEASE_BUILD", "0"))
), "NVTE_RELEASE_BUILD env must be set for metapackage build."
ext_modules = []
cmdclass = {}
package_data = {}
include_package_data = False
setup_requires = []
......@@ -169,15 +176,11 @@ if __name__ == "__main__":
else:
setup_requires, install_requires, test_requires = setup_requirements()
ext_modules = [setup_common_extension()]
cmdclass = {"build_ext": CMakeBuildExtension, "bdist_wheel": TimedBdist}
package_data = {"": ["VERSION.txt"]}
include_package_data = True
extras_require = {"test": test_requires}
if not bool(int(os.getenv("NVTE_RELEASE_BUILD", "0"))):
# Remove residual FW packages since compiling from source
# results in a single binary with FW extensions included.
uninstall_te_wheel_packages()
if "pytorch" in frameworks:
from build_tools.pytorch import setup_pytorch_extension
......
......@@ -116,10 +116,10 @@ void performTest(const size_t N, const size_t H) {
DType itype = TypeInfo<IType>::dtype;
DType otype = TypeInfo<OType>::dtype;
Tensor input("input", { N, H }, itype);
Tensor output("output", { N, H }, otype);
Tensor igrad("igrad", { N, H }, itype);
Tensor ograd("ograd", { N, H }, itype);
Tensor input("input", std::vector<size_t>{ N, H }, itype);
Tensor output("output", std::vector<size_t>{ N, H }, otype);
Tensor igrad("igrad", std::vector<size_t>{ N, H }, itype);
Tensor ograd("ograd", std::vector<size_t>{ N, H }, itype);
fillUniform(&input);
fillUniform(&ograd);
......@@ -171,10 +171,10 @@ void performTestGLU(const size_t N, const size_t H) {
DType itype = TypeInfo<IType>::dtype;
DType otype = TypeInfo<OType>::dtype;
Tensor input("input", {N, H * 2}, itype);
Tensor output("output", {N, H}, otype);
Tensor igrad("igrad", { N, H * 2 }, itype);
Tensor ograd("ograd", { N, H }, itype);
Tensor input("input", std::vector<size_t>{N, H * 2}, itype);
Tensor output("output", std::vector<size_t>{N, H}, otype);
Tensor igrad("igrad", std::vector<size_t>{ N, H * 2 }, itype);
Tensor ograd("ograd", std::vector<size_t>{ N, H }, itype);
fillUniform(&input);
fillUniform(&ograd);
......
......@@ -70,7 +70,7 @@ void performTest(const std::vector<size_t>& shape) {
Tensor output_c("output_c", shape, otype);
// dbias has the same data type with "output grad"
Tensor dbias("dbias", {H}, itype);
Tensor dbias("dbias", std::vector<size_t>{H}, itype);
fillUniform(&input);
setRandomScale(&output_c);
......
......@@ -79,7 +79,7 @@ void performTest(const std::vector<size_t>& shape) {
Tensor output_c("output_c", shape, otype);
// dbias has the same data type with "output grad"
Tensor dbias("dbias", {H}, itype);
Tensor dbias("dbias", std::vector<size_t>{H}, itype);
fillUniform(&input);
fillUniform(&grad);
......
......@@ -280,7 +280,7 @@ void runTestCase(const ProcessingMethod processing_method, const std::vector<siz
Tensor grad("grad", shape, itype);
Tensor output_c("output_c", shape, otype, rowwise, colwise,
opts.block_scaling_dim == 2 ? NVTE_BLOCK_SCALING_2D : NVTE_BLOCK_SCALING_1D);
Tensor output_dbias("output_dbias", {cols}, itype);
Tensor output_dbias("output_dbias", std::vector<size_t>{cols}, itype);
std::unique_ptr<OutputType[]> ref_output = std::make_unique<OutputType[]>(rows * cols);
std::unique_ptr<OutputType[]> ref_output_t = std::make_unique<OutputType[]>(rows * cols);
......@@ -355,7 +355,7 @@ void runTestCaseOneDimensionalBlocks(const ProcessingMethod processing_method,
Tensor grad("grad", shape, itype);
Tensor output_c("output_c", shape, otype, rowwise, colwise,
opts.block_scaling_dim == 2 ? NVTE_BLOCK_SCALING_2D : NVTE_BLOCK_SCALING_1D);
Tensor output_dbias("output_dbias", {cols}, itype);
Tensor output_dbias("output_dbias", std::vector<size_t>{cols}, itype);
std::unique_ptr<OutputType[]> ref_output = std::make_unique<OutputType[]>(rows * cols);
std::unique_ptr<OutputType[]> ref_output_t = std::make_unique<OutputType[]>(rows * cols);
......
......@@ -230,7 +230,7 @@ void performTest_x1(const ProcessingMethod processing_method,
Tensor input("input", shape, itype);
Tensor grad("grad", shape, itype);
Tensor output_c("output_c", shape, otype, rowwise, colwise, NVTE_MXFP8_1D_SCALING);
Tensor output_dbias("output_dbias", { cols }, itype);
Tensor output_dbias("output_dbias", std::vector<size_t>{ cols }, itype);
std::unique_ptr<OutputType[]> ref_output_c = std::make_unique<OutputType[]>(rows * cols);
std::unique_ptr<InputType[]> ref_output_dbias = std::make_unique<InputType[]>(cols);
......@@ -368,7 +368,7 @@ void performTest_x2(const ProcessingMethod processing_method,
Tensor input("input", shape, itype);
Tensor grad("grad", shape, itype);
Tensor output("output", shape, otype, true, true, NVTE_MXFP8_1D_SCALING);
Tensor output_dbias("output_dbias", { cols }, itype);
Tensor output_dbias("output_dbias", std::vector<size_t>{ cols }, itype);
std::unique_ptr<OutputType[]> ref_output_c_rowwise = std::make_unique<OutputType[]>(rows * cols);
std::unique_ptr<OutputType[]> ref_output_c_colwise = std::make_unique<OutputType[]>(rows * cols);
......
......@@ -204,8 +204,8 @@ void performTest_x1(const size_t rows,
// std::cout << "blocks_X: " << blocks_X << std::endl;
// std::cout << "scales_stride: " << scales_stride << std::endl;
Tensor grad("grad", { rows, cols }, itype);
Tensor input("input", { rows, cols * 2 }, itype);
Tensor grad("grad", std::vector<size_t>{ rows, cols }, itype);
Tensor input("input", std::vector<size_t>{ rows, cols * 2 }, itype);
const size_t output_cols = (IS_DGATED ? 2 : 1) * cols;
......@@ -289,8 +289,8 @@ void performTest_x2(const size_t rows,
DType itype = TypeInfo<IType>::dtype;
DType otype = TypeInfo<OType>::dtype;
Tensor grad("grad", { rows, cols }, itype);
Tensor input("input", { rows, cols * 2 }, itype);
Tensor grad("grad", std::vector<size_t>{ rows, cols }, itype);
Tensor input("input", std::vector<size_t>{ rows, cols * 2 }, itype);
const size_t output_cols = (IS_DGATED ? 2 : 1) * cols;
......
......@@ -47,8 +47,8 @@ void performTest(const size_t N, const size_t H) {
DType itype = TypeInfo<InputType>::dtype;
DType otype = TypeInfo<OutputType>::dtype;
Tensor input("input", { N, H }, itype);
Tensor output("output", { N, H }, otype, true, true);
Tensor input("input", std::vector<size_t>{ N, H }, itype);
Tensor output("output", std::vector<size_t>{ N, H }, otype, true, true);
std::unique_ptr<OutputType[]> ref_output_c = std::make_unique<OutputType[]>(N * H);
std::unique_ptr<OutputType[]> ref_output_t = std::make_unique<OutputType[]>(N * H);
......
......@@ -112,8 +112,8 @@ void performTest(const size_t N, const size_t H) {
}
}
Tensor input("input", { N, H }, itype);
Tensor output("output", { N, H }, otype, true, true);
Tensor input("input", std::vector<size_t>{ N, H }, itype);
Tensor output("output", std::vector<size_t>{ N, H }, otype, true, true);
std::unique_ptr<OutputType[]> ref_output_c = std::make_unique<OutputType[]>(N * H);
std::unique_ptr<OutputType[]> ref_output_t = std::make_unique<OutputType[]>(N * H);
......
......@@ -65,11 +65,11 @@ void performTest(const size_t N, const size_t H) {
DType itype = TypeInfo<IType>::dtype;
DType otype = TypeInfo<OType>::dtype;
Tensor input("input", {N, H}, itype);
Tensor input("input", std::vector<size_t>{N, H}, itype);
Tensor output("output", {N, H}, otype, true, true);
Tensor output("output", std::vector<size_t>{N, H}, otype, true, true);
// dbias has the same data type with "output grad"
Tensor dbias("dbias", {H}, itype);
Tensor dbias("dbias", std::vector<size_t>{H}, itype);
fillUniform(&input);
setRandomScale(&output);
......
......@@ -76,12 +76,12 @@ void performTest(const size_t N, const size_t H) {
DType itype = TypeInfo<IType>::dtype;
DType otype = TypeInfo<OType>::dtype;
Tensor input("input", {N, H}, itype);
Tensor gelu_input("gelu_input", {N, H}, itype);
Tensor input("input", std::vector<size_t>{N, H}, itype);
Tensor gelu_input("gelu_input", std::vector<size_t>{N, H}, itype);
Tensor output("output", {N, H}, otype, true, true);
Tensor output("output", std::vector<size_t>{N, H}, otype, true, true);
// dbias has the same data type with "output grad"
Tensor dbias("dbias", {H}, itype);
Tensor dbias("dbias", std::vector<size_t>{H}, itype);
fillUniform(&input);
fillUniform(&gelu_input);
......
......@@ -74,9 +74,9 @@ void performTest(const size_t N, const size_t H) {
DType itype = TypeInfo<IType>::dtype;
DType otype = TypeInfo<OType>::dtype;
Tensor grad("grad", {N, H}, itype);
Tensor input("input", {N, H * 2}, itype);
Tensor output("output", {N, H * 2}, otype, true, true);
Tensor grad("grad", std::vector<size_t>{N, H}, itype);
Tensor input("input", std::vector<size_t>{N, H * 2}, itype);
Tensor output("output", std::vector<size_t>{N, H * 2}, otype, true, true);
fillUniform(&grad);
fillUniform(&input);
......
......@@ -153,11 +153,11 @@ void performTest(
DType itype = TypeInfo<Type>::dtype;
Tensor data_in("data_in", { batches, heads, rows, cols }, itype);
Tensor softmax_out("softmax_out", { batches, heads, rows, cols }, itype);
Tensor softmax_in("softmax_in", { batches, heads, rows, cols }, itype);
Tensor grads_in("grads_in", { batches, heads, rows, cols }, itype);
Tensor grads_out("grads_out", { batches, heads, rows, cols }, itype);
Tensor data_in("data_in", std::vector<size_t>{ batches, heads, rows, cols }, itype);
Tensor softmax_out("softmax_out", std::vector<size_t>{ batches, heads, rows, cols }, itype);
Tensor softmax_in("softmax_in", std::vector<size_t>{ batches, heads, rows, cols }, itype);
Tensor grads_in("grads_in", std::vector<size_t>{ batches, heads, rows, cols }, itype);
Tensor grads_out("grads_out", std::vector<size_t>{ batches, heads, rows, cols }, itype);
const size_t elements_total = batches * heads * rows * cols;
std::unique_ptr<Type[]> softmax_out_ref = std::make_unique<Type[]>(elements_total);
......
......@@ -214,10 +214,10 @@ void performTest_x1(const size_t rows,
const size_t blocks_num = rowwise ? blocks_num_rowwise : blocks_num_colwise;
const size_t scales_stride = rowwise ? blocks_X_rowwise : blocks_X_colwise;
Tensor input("input", { rows, cols }, itype, rowwise, colwise, NVTE_MXFP8_1D_SCALING);
Tensor input("input", std::vector<size_t>{ rows, cols }, itype, rowwise, colwise, NVTE_MXFP8_1D_SCALING);
// Output data are written to the rowwise ptr regardless of the scaling direction
Tensor output("output", { rows, cols }, otype, true, false);
Tensor output("output", std::vector<size_t>{ rows, cols }, otype, true, false);
std::unique_ptr<OutputType[]> ref_output = std::make_unique<OutputType[]>(rows * cols);
std::unique_ptr<fp8e8m0[]> scales = std::make_unique<fp8e8m0[]>(blocks_num);
......@@ -267,11 +267,11 @@ void performTest_quantize_then_dequantize(const size_t rows,
// input --> quantized --> output (dequantized)
// input == output
Tensor input("input", { rows, cols }, in_type);
Tensor quantized("quantized", { rows, cols }, intermed_type, rowwise, colwise, NVTE_MXFP8_1D_SCALING);
Tensor input("input", std::vector<size_t>{ rows, cols }, in_type);
Tensor quantized("quantized", std::vector<size_t>{ rows, cols }, intermed_type, rowwise, colwise, NVTE_MXFP8_1D_SCALING);
// Output data are written to the rowwise ptr regardless of the scaling direction
Tensor output("output", { rows, cols }, out_type, true, false);
Tensor output("output", std::vector<size_t>{ rows, cols }, out_type, true, false);
// fillCase<EncodingType>(&input, InputsFillCase::minNorm_to_maxNorm);
fillCase<EncodingType>(&input, InputsFillCase::uniform);
......@@ -333,8 +333,8 @@ void performTest_x2(const size_t rows,
const size_t blocks_num_rowwise = blocks_Y_rowwise * blocks_X_rowwise;
const size_t blocks_num_colwise = blocks_Y_colwise * blocks_X_colwise;
Tensor input("input", { rows, cols }, itype, true, true, NVTE_MXFP8_1D_SCALING);
Tensor output("output", { rows, cols }, otype);
Tensor input("input", std::vector<size_t>{ rows, cols }, itype, true, true, NVTE_MXFP8_1D_SCALING);
Tensor output("output", std::vector<size_t>{ rows, cols }, otype);
std::unique_ptr<OutputType[]> ref_output_rowwise = std::make_unique<OutputType[]>(rows * cols);
std::unique_ptr<OutputType[]> ref_output_colwise = std::make_unique<OutputType[]>(rows * cols);
......
/*************************************************************************
* Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See LICENSE for license information.
************************************************************************/
#include <cmath>
#include <cstring>
#include <memory>
#include <iomanip>
#include <iostream>
#include <random>
#include <type_traits>
#include <cuda_bf16.h>
#include <cuda_runtime.h>
#include <gtest/gtest.h>
#include <transformer_engine/transformer_engine.h>
#include "../test_common.h"
using namespace transformer_engine;
// Parameterized fixture for nvte_memset tests. Parameters:
//   <0> int    — byte value written by the memset,
//   <1> size_t — buffer size in bytes.
class MemsetTestSuite : public ::testing::TestWithParam<std::tuple<int,
size_t>> {};
// Verifies that nvte_memset fills exactly size_in_bytes bytes of device
// memory with the requested value.
TEST_P(MemsetTestSuite, TestMemset) {
  using namespace transformer_engine;
  using namespace test;

  const int value = std::get<0>(GetParam());
  const size_t size_in_bytes = std::get<1>(GetParam());

  // Initialize the host buffer to a value different from the memset value so
  // a no-op memset cannot accidentally pass the verification loop below.
  std::vector<uint8_t> h_buffer(size_in_bytes, static_cast<uint8_t>(value + 1));

  char* d_ptr = nullptr;
  NVTE_CHECK_CUDA(cudaMalloc(&d_ptr, size_in_bytes));
  NVTE_CHECK_CUDA(cudaMemcpy(d_ptr, h_buffer.data(), size_in_bytes, cudaMemcpyHostToDevice));

  nvte_memset(d_ptr, value, size_in_bytes, 0 /* stream */);

  NVTE_CHECK_CUDA(cudaMemcpy(
      h_buffer.data(), d_ptr, size_in_bytes, cudaMemcpyDeviceToHost));
  // Fix: synchronize BEFORE freeing the device buffer. The original freed
  // first and synchronized afterwards, which relied solely on the implicit
  // synchronization of the blocking cudaMemcpy above.
  NVTE_CHECK_CUDA(cudaDeviceSynchronize());
  NVTE_CHECK_CUDA(cudaFree(d_ptr));

  for (size_t i = 0; i < size_in_bytes; ++i) {
    EXPECT_EQ(h_buffer[i], static_cast<uint8_t>(value))
        << "Mismatch at index " << i << ": expected " << static_cast<int>(value)
        << ", got " << static_cast<int>(h_buffer[i]);
  }
}
namespace {

// Buffer sizes (in bytes) exercised by the memset tests: single/small sizes,
// non-power-of-two sizes (9, 4097), and larger power-of-two sizes, to cover
// both unaligned tails and bulk fills.
std::vector<size_t> memset_test_sizes = {
  1,
  4,
  9,
  16,
  128,
  4096,
  4097,
  8192,
};

}  // namespace
// Instantiate TestMemset over the cross product of memset values {0, 6} and
// all buffer sizes; generated test names have the form "<value>X<size>".
INSTANTIATE_TEST_SUITE_P(
    OperatorTest,
    MemsetTestSuite,
    ::testing::Combine(
        ::testing::Values(0, 6),
        ::testing::ValuesIn(memset_test_sizes)),
    [](const testing::TestParamInfo<MemsetTestSuite::ParamType>& param_info) {
      const auto& [memset_value, num_bytes] = param_info.param;
      return std::to_string(memset_value) + "X" + std::to_string(num_bytes);
    });
......@@ -81,9 +81,9 @@ void performTest() {
for (size_t tensor_id = 0; tensor_id < num_tensors; ++tensor_id) {
const size_t height = tensor_dims[tensor_id].first;
const size_t width = tensor_dims[tensor_id].second;
input_list.emplace_back(Tensor("input_" + std::to_string(tensor_id), { height, width }, itype));
input_list.emplace_back(Tensor("input_" + std::to_string(tensor_id), std::vector<size_t>{ height, width }, itype));
output_list.emplace_back(Tensor("output_" + std::to_string(tensor_id),
{ height, width }, otype, true, true));
std::vector<size_t>{ height, width }, otype, true, true));
auto& input = input_list.back();
auto& output = output_list.back();
......
......@@ -85,8 +85,8 @@ void performTest() {
const size_t height = tensor_dims[tensor_id].first;
const size_t width = tensor_dims[tensor_id].second;
const size_t padded_height = (height + align - 1) / align * align;
input_list.emplace_back(Tensor("input_" + std::to_string(tensor_id), { height, width }, itype));
output_list.emplace_back(Tensor("output_" + std::to_string(tensor_id), { padded_height, width }, otype));
input_list.emplace_back(Tensor("input_" + std::to_string(tensor_id), std::vector<size_t>{ height, width }, itype));
output_list.emplace_back(Tensor("output_" + std::to_string(tensor_id), std::vector<size_t>{ padded_height, width }, otype));
auto& input = input_list.back();
auto& output = output_list.back();
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment