Commit a6db9526 authored by Jiezhong Qiu

update

parent ca3ece2c
.gitignore
@@ -2,4 +2,5 @@
 data/
 libtorch-shared-with-deps-*
 pytorch/cuda/build
-exp/
\ No newline at end of file
+exp/
+.vscode/
\ No newline at end of file
CMakeLists.txt
@@ -4,8 +4,8 @@ project(moe)
 find_package(Torch REQUIRED)
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")
-include_directories("/home/jiezhong/anaconda3/envs/torch/include/python3.6m"
-        "/usr/local/cuda-/include"
+include_directories("/home/jiezhong/miniconda3/include/python3.8"
+        "/usr/local/cuda/include"
         "/usr/local/cuda/samples/common/inc")
 add_executable(moe moe.cpp)
 target_link_libraries(moe
...
@@ -14,7 +14,7 @@
 #include <helper_cuda.h>
-const int num_stream=16;
+const int num_stream=512;
 // std::vector<torch::Tensor>
 void moe_cuda_forward(
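The stream pool grows from 16 to 512 so that, at the sizes used in main() below (2048 samples, top-2 gating), each in-flight GEMM can be issued on its own stream. A hedged aside: how many of those streams actually overlap is capped by the device's hardware work queues, which CUDA exposes through the CUDA_DEVICE_MAX_CONNECTIONS environment variable (default 8). A minimal sketch, assuming a Linux host; the value 32 and the cudaFree(0) idiom for forcing context creation are illustrative, not from the commit:

#include <cstdlib>
#include <cuda_runtime.h>

int main() {
    // Must be set before the CUDA context is created, or it has no effect;
    // otherwise many of the 512 streams alias onto the same hardware queue.
    setenv("CUDA_DEVICE_MAX_CONNECTIONS", "32", /*overwrite=*/1);
    cudaFree(0);  // forces context creation with the new setting
    // ... create streams and launch work as in the diff below ...
    return 0;
}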
@@ -27,8 +27,8 @@ void moe_cuda_forward(
     const auto num_expert = gate.size(1);
     const auto d_model = weight.size(1);
     const auto d_ffn = weight.size(2);
+    printf("b=%d, expert=%d, d_model=%d, d_ffn=%d\n", batch_size, num_expert, d_model, d_ffn);
     auto output = input.new_zeros({batch_size, num_expert, d_ffn});
-    std::cout << output << std::endl;
     cublasHandle_t handle;
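One caveat on the printf added in this hunk: batch_size and the other dimensions come from Tensor::size(), which returns int64_t, so the %d conversions rely on the platform's integer widths happening to match. A hedged, portable variant, with the sizes from main() used as stand-in values:

#include <cstdio>
#include <cstdint>

int main() {
    int64_t batch_size = 2048, num_expert = 2, d_model = 512, d_ffn = 2048;
    // Casting to long long makes the format specifier width-correct on LP64.
    printf("b=%lld, expert=%lld, d_model=%lld, d_ffn=%lld\n",
           (long long)batch_size, (long long)num_expert,
           (long long)d_model, (long long)d_ffn);
    return 0;
}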
@@ -39,50 +39,66 @@
         checkCudaErrors(cudaStreamCreate(&stream[i]));
     }
+    cudaEvent_t start, stop;
+    checkCudaErrors(cudaEventCreate(&start));
+    checkCudaErrors(cudaEventCreate(&stop));
+    // Record the start event
+    checkCudaErrors(cudaEventRecord(start, NULL));
     size_t s;
     for (size_t i=0; i<batch_size; ++i) {
         for (size_t j=0; j<num_expert; ++j) {
             s = (i * num_expert + j) % num_stream;
-            printf("i=%d j=%d goes to stream %d\n", i, j, s);
-            cublasSetStream(handle, stream[s]);
+            // printf("i=%d j=%d goes to stream %d\n", i, j, s);
+            checkCudaErrors(cublasSetStream(handle, stream[s]));
             if (input.scalar_type() == torch::ScalarType::Float) {
                 float alpha = 1.0;
                 float beta = 0.0;
-                std::cout << input[i] << std::endl;
-                std::cout << weight.index(gate[i][j]) << std::endl;
-                std::cout << output[i][j] << std::endl;
-                cublasSgemm(handle,
+                checkCudaErrors(cublasSgemm(handle,
                     CUBLAS_OP_N,
                     CUBLAS_OP_N,
                     1, // m
                     d_ffn, // n
                     d_model, // k
                     &alpha,
-                    input.data_ptr<float>() + i * d_model,
-                    // input[i].data_ptr<float>(),
+                    input[i].data_ptr<float>(),
                     1,
                     weight.index(gate[i][j]).data_ptr<float>(),
                     d_model,
                     &beta,
-                    output.data_ptr<float>() + i * num_expert * d_ffn + j * d_ffn,
-                    1);
+                    output[i][j].data_ptr<float>(),
+                    1));
             } else {
                 printf("only support float!!!\n");
            }
         }
     }
-    cudaDeviceSynchronize();
-    printf("synchronized\n");
+    // checkCudaErrors(cudaDeviceSynchronize());
+    // Record the stop event
+    checkCudaErrors(cudaEventRecord(stop, NULL));
+    // Wait for the stop event to complete
+    checkCudaErrors(cudaEventSynchronize(stop));
+    float msecTotal = 0.0f;
+    checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop));
+    // Compute and print the performance
+    float msecPerMatrixMul = msecTotal / batch_size / num_expert;
+    double flopsPerMatrixMul = 2.0 * (double)d_model * (double)d_ffn;
+    double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
+    printf(
+        "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops\n",
+        gigaFlops,
+        msecPerMatrixMul,
+        flopsPerMatrixMul);
+    // std::cout << output << std::endl;
     for (size_t i=0; i<num_stream; ++i) {
-        cudaStreamDestroy(stream[i]);
+        checkCudaErrors(cudaStreamDestroy(stream[i]));
     }
-    std::cout << output << std::endl;
-    cublasDestroy(handle);
+    checkCudaErrors(cublasDestroy(handle));
 }
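Each iteration computes one output row as a vector-matrix product: cuBLAS is column-major, so the 1 x d_model row of input is passed as the m=1, k=d_model operand with lda=1, the selected expert's weight block as the k x n operand with ldb=d_model, and the result is a 1 x d_ffn row with ldc=1. The throughput printed at the end follows from 2*d_model*d_ffn FLOPs per product; with the sizes in main() that is 2*512*2048, roughly 2.1 MFLOP per call, and 2048*2 = 4096 calls, about 8.6 GFLOP in total. A self-contained sketch of the same cublasSgemm parameterization on toy sizes (all names and values illustrative, error handling elided):

#include <cstdio>
#include <vector>
#include <cuda_runtime.h>
#include <cublas_v2.h>

int main() {
    const int k = 4, n = 3;
    std::vector<float> x(k, 1.0f);          // row vector of ones, 1 x k
    std::vector<float> W(k * n);            // column-major k x n
    for (int c = 0; c < n; ++c)
        for (int r = 0; r < k; ++r)
            W[c * k + r] = (float)(c + 1);  // column c holds the value c+1
    float *dx, *dW, *dy;
    cudaMalloc(&dx, k * sizeof(float));
    cudaMalloc(&dW, k * n * sizeof(float));
    cudaMalloc(&dy, n * sizeof(float));
    cudaMemcpy(dx, x.data(), k * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dW, W.data(), k * n * sizeof(float), cudaMemcpyHostToDevice);
    cublasHandle_t handle;
    cublasCreate(&handle);
    float alpha = 1.0f, beta = 0.0f;
    // y(1 x n) = x(1 x k) * W(k x n); lda=1, ldb=k, ldc=1 as in the diff above
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                1, n, k, &alpha, dx, 1, dW, k, &beta, dy, 1);
    std::vector<float> y(n);
    cudaMemcpy(y.data(), dy, n * sizeof(float), cudaMemcpyDeviceToHost);
    for (int c = 0; c < n; ++c)
        printf("y[%d] = %.1f (expect %.1f)\n", c, y[c], (float)(k * (c + 1)));
    cublasDestroy(handle);
    cudaFree(dx); cudaFree(dW); cudaFree(dy);
    return 0;
}

With x all ones and column c of W filled with c+1, the printed y[c] should be k*(c+1), which confirms the m=1 row-vector reading of the operands.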
@@ -96,10 +112,10 @@ void moe_cuda_forward(
 int main() {
     int device=2;
-    torch::Tensor input = torch::randn({2, 4}, torch::dtype(torch::kFloat32).device(torch::kCUDA, device));
-    torch::Tensor gate = torch::zeros({2, 1}, torch::dtype(torch::kInt64).device(torch::kCUDA, device));
-    torch::Tensor weight = torch::randn({2, 4, 4}, torch::dtype(torch::kFloat32).device(torch::kCUDA, device));
-    torch::Tensor bias = torch::randn({2, 4}, torch::dtype(torch::kFloat32).device(torch::kCUDA, device));
-    std::cout << input << std::endl;
+    torch::Tensor input = torch::randn({2048, 512}, torch::dtype(torch::kFloat32).device(torch::kCUDA, device));
+    torch::Tensor gate = torch::zeros({2048, 2}, torch::dtype(torch::kInt64).device(torch::kCUDA, device));
+    torch::Tensor weight = torch::randn({2, 512, 2048}, torch::dtype(torch::kFloat32).device(torch::kCUDA, device));
+    torch::Tensor bias = torch::randn({2, 2048}, torch::dtype(torch::kFloat32).device(torch::kCUDA, device));
+    checkCudaErrors(cudaSetDevice(device));
     moe_cuda_forward(input, gate, weight, bias);
 }
\ No newline at end of file
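For sanity-checking the kernel at these sizes, here is a hedged LibTorch reference path (moe_forward_ref is a made-up name, not part of the commit): it forms input[i] times weight[gate[i][j]] with ordinary row-major semantics. One caveat: the cublasSgemm call above reads each weight block with ldb = d_model, i.e. as a column-major [d_model, d_ffn] matrix, so the two paths agree only if the weight layout is chosen to match that reading.

#include <torch/torch.h>
#include <iostream>

torch::Tensor moe_forward_ref(const torch::Tensor& input,   // [batch, d_model]
                              const torch::Tensor& gate,    // [batch, num_expert], int64
                              const torch::Tensor& weight)  // [E, d_model, d_ffn]
{
    const auto batch_size = input.size(0);
    const auto num_expert = gate.size(1);
    const auto d_ffn = weight.size(2);
    auto output = input.new_zeros({batch_size, num_expert, d_ffn});
    for (int64_t i = 0; i < batch_size; ++i) {
        for (int64_t j = 0; j < num_expert; ++j) {
            // Pick the expert chosen by the gate, then do a [d_model] x
            // [d_model, d_ffn] matmul, yielding the [d_ffn] output row.
            auto e = gate[i][j].item<int64_t>();
            output[i][j] = torch::matmul(input[i], weight[e]);
        }
    }
    return output;
}

// Hypothetical usage after moe_cuda_forward has filled `output`:
//   std::cout << torch::allclose(output, moe_forward_ref(input, gate, weight),
//                                /*rtol=*/1e-4, /*atol=*/1e-5) << std::endl;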