Commit 241cdb78 authored by Rick Ho

remove redundant files

parent c5cfd5fb
cmake_minimum_required(VERSION 3.0 FATAL_ERROR)
project(moe)
find_package(Torch REQUIRED)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")
if(NOT PYTHON_INCLUDE)
    set(PYTHON_INCLUDE "/home/jiezhong/miniconda3/include/python3.8")
endif()
if(NOT CUDA_HOME)
    set(CUDA_HOME "/usr/local/cuda")
endif()
if(NOT CUDA_SAMPLE_INCLUDE)
    set(CUDA_SAMPLE_INCLUDE "/usr/local/cuda/samples/common/inc")
endif()
include_directories(
    "${PYTHON_INCLUDE}"
    "${CUDA_HOME}/include"
    "${CUDA_SAMPLE_INCLUDE}"
)
add_executable(moe moe.cpp cuda_stream_manager.cpp)
target_link_libraries(moe
"${TORCH_LIBRARIES}")
set_property(TARGET moe PROPERTY CXX_STANDARD 14)
# The following code block is suggested to be used on Windows.
# According to https://github.com/pytorch/pytorch/issues/25457,
# the DLLs need to be copied to avoid memory errors.
if (MSVC)
    file(GLOB TORCH_DLLS "${TORCH_INSTALL_PREFIX}/lib/*.dll")
    add_custom_command(TARGET moe
        POST_BUILD
        COMMAND ${CMAKE_COMMAND} -E copy_if_different
        ${TORCH_DLLS}
        $<TARGET_FILE_DIR:moe>)
endif (MSVC)
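// Standalone test for the device-pointer generation kernels below: it builds
// arrays of device pointers (sequential and offset-indexed) from a fake base
// address and prints them for manual inspection.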
#include <cuda_runtime.h>
#include <iostream>
#include <vector>
#include <cstdlib>
#include <cstdio>
using namespace std;
typedef float data_t;
cudaStream_t st = 0;
__global__
void generate_ptr_sequential_kernel(int n, data_t* base, size_t stride, data_t** ptrs) {
    size_t idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx < n) {
        // Pointer i addresses the i-th contiguous block of `stride` elements.
        ptrs[idx] = base + stride * idx;
    }
}
__global__
void generate_ptr_offset_kernel(int n, data_t* base, size_t stride, int* offset, data_t** ptrs) {
    size_t idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx < n) {
        // Pointer i addresses the block selected by offset[i].
        ptrs[idx] = base + stride * offset[idx];
    }
}
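// CEIL(x, y): ceiling division, used to compute the grid size covering n threads.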
#define CEIL(_x_,_y_) (((_x_)-1)/(_y_)+1)
data_t** generate_ptr(int n, data_t* base, size_t stride, int* d_offset = 0) {
    dim3 griddim(CEIL(n, 256));
    dim3 blockdim(256);
    data_t** ptrs;
    cudaMalloc(&ptrs, n * sizeof(data_t*));
    if (d_offset) {
        generate_ptr_offset_kernel<<<griddim, blockdim, 0, st>>>(n, base, stride, d_offset, ptrs);
    } else {
        generate_ptr_sequential_kernel<<<griddim, blockdim, 0, st>>>(n, base, stride, ptrs);
    }
    cudaError_t err = cudaPeekAtLastError();
    if (err) {
        std::cerr << "CUDA " << cudaGetErrorString(err) << " at " << __FILE__ << ":" << __LINE__ << std::endl;
    }
    cudaStreamSynchronize(st);
    return ptrs;
}
int main() {
    cudaStreamCreate(&st);
    int n = 128;
    int offset[128], *d_offset;
    // Fake base address: the generated pointers are only printed, never dereferenced.
    float* base = (float*)0x10, **d_res, **res;
    for (int i = 0; i < n; ++i) {
        offset[i] = rand() % n;
    }
    cudaMalloc(&d_offset, n * sizeof(int));
    cudaMemcpy(d_offset, offset, n * sizeof(int), cudaMemcpyHostToDevice);
    d_res = generate_ptr(n, base, 0x100);
    res = new float*[n];
    cudaMemcpy(res, d_res, n * sizeof(float*), cudaMemcpyDeviceToHost);
    puts("Sequential addr check");
    for (int i = 0; i < 10; ++i) {
        printf("%08lx ", (unsigned long)res[i]);
    }
    putchar('\n');
    d_res = generate_ptr(n, base, 0x400, d_offset);
    cudaMemcpy(res, d_res, n * sizeof(float*), cudaMemcpyDeviceToHost);
    puts("Offset addr check");
    for (int i = 0; i < 10; ++i) {
        printf("%08lx /%08x\n", (unsigned long)res[i], offset[i]);
    }
    putchar('\n');
}
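// Microbenchmark: times cublasSgemmBatched over d_batch single-row GEMMs,
// each multiplying one input row by one of d_experts expert weight matrices
// chosen by a random offset (the MoE expert-dispatch pattern).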
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <iostream>
#include <vector>
#include <cstdlib>
#include <cstdio>
#include "timer.hh"
static const char *geterr(cublasStatus_t error)
{
    switch (error) {
    case CUBLAS_STATUS_SUCCESS:
        return "CUBLAS_STATUS_SUCCESS";
    case CUBLAS_STATUS_NOT_INITIALIZED:
        return "CUBLAS_STATUS_NOT_INITIALIZED";
    case CUBLAS_STATUS_ALLOC_FAILED:
        return "CUBLAS_STATUS_ALLOC_FAILED";
    case CUBLAS_STATUS_INVALID_VALUE:
        return "CUBLAS_STATUS_INVALID_VALUE";
    case CUBLAS_STATUS_ARCH_MISMATCH:
        return "CUBLAS_STATUS_ARCH_MISMATCH";
    case CUBLAS_STATUS_MAPPING_ERROR:
        return "CUBLAS_STATUS_MAPPING_ERROR";
    case CUBLAS_STATUS_EXECUTION_FAILED:
        return "CUBLAS_STATUS_EXECUTION_FAILED";
    case CUBLAS_STATUS_INTERNAL_ERROR:
        return "CUBLAS_STATUS_INTERNAL_ERROR";
    }
    return "<unknown>";
}
#define cublas_safe_call(__fn__) { \
    cublasStatus_t res = __fn__; \
    if (res != CUBLAS_STATUS_SUCCESS) { \
        std::cerr << "Cublas " << geterr(res) << " at " << __FILE__ << ":" << __LINE__ << std::endl; \
    } \
}
#define cuda_safe_call(__fn__) { \
    auto res = __fn__; \
    if (res) { \
        std::cerr << "CUDA " << cudaGetErrorString(res) << " at " << __FILE__ << ":" << __LINE__ << std::endl; \
    } \
}
using namespace std;
typedef float data_t;
int d_batch = 4096;
int d_matx = 1;
int d_experts = 128;
int d_in = 1024;
int d_hidden = 4096;
int d_out = 1024;
data_t *featin, *feath, *weight1, *weight2, *featout;
int *offset;
cublasHandle_t hdl;
cudaStream_t st;
void prepare() {
    // Create a dedicated stream and bind the cuBLAS handle to it.
    cudaStreamCreate(&st);
    cublasCreate(&hdl);
    cublasSetStream(hdl, st);
}
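// One pass of the first expert layer: for each of the d_batch rows, multiply
// the input row by the expert weight matrix selected by offset[i], issued as
// a single batched GEMM over per-sample A/B/C pointer arrays.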
void compute() {
    vector<data_t*> aptrs, bptrs, cptrs;
    float **ptrs;
    // One device buffer holds the A, B, and C pointer arrays back to back.
    cudaMalloc(&ptrs, d_batch * sizeof(float*) * 3);
    for (int i = 0; i < d_batch; ++i) {
        aptrs.push_back(featin + 1 * d_in * i);
        bptrs.push_back(weight1 + d_hidden * d_in * offset[i]);
        cptrs.push_back(feath + d_hidden * i);
    }
    cudaMemcpy(ptrs, aptrs.data(), d_batch * sizeof(float*),
            cudaMemcpyHostToDevice);
    cudaMemcpy(ptrs + d_batch, bptrs.data(), d_batch * sizeof(float*),
            cudaMemcpyHostToDevice);
    cudaMemcpy(ptrs + d_batch * 2, cptrs.data(), d_batch * sizeof(float*),
            cudaMemcpyHostToDevice);
    data_t alpha = 1, beta = 0;
    cublas_safe_call(cublasSgemmBatched(hdl,
            CUBLAS_OP_T,
            CUBLAS_OP_T,
            d_matx, d_hidden, d_in,
            &alpha,
            ptrs, d_in,
            ptrs + d_batch, d_hidden,
            &beta,
            ptrs + d_batch * 2, d_matx,
            d_batch));
    cudaStreamSynchronize(st);
    // Release the pointer-array buffer allocated above.
    cudaFree(ptrs);
    // cudaDeviceSynchronize();
}
int main() {
    cuda_safe_call(cudaMalloc(&weight1, d_in * d_hidden * d_experts * sizeof(data_t)));
    cuda_safe_call(cudaMalloc(&weight2, d_out * d_hidden * d_experts * sizeof(data_t)));
    cuda_safe_call(cudaMalloc(&featin, d_batch * d_matx * d_in * sizeof(data_t)));
    cuda_safe_call(cudaMalloc(&feath, d_batch * d_matx * d_hidden * sizeof(data_t)));
    cuda_safe_call(cudaMalloc(&featout, d_batch * d_matx * d_out * sizeof(data_t)));
    prepare();
    double tsum = 0, tmax = 0;
    int nt = 16;
    offset = new int[d_batch];
    for (int i = 0; i < d_batch; ++i) {
        offset[i] = rand() % d_experts;
    }
    // Warm-up run before timing.
    compute();
    for (int i = 0; i < nt; ++i) {
        for (int j = 0; j < d_batch; ++j) {
            offset[j] = rand() % d_experts;
        }
        timestamp(start);
        compute();
        timestamp(end);
        auto t = getDuration(start, end);
        tsum += t;
        if (t > tmax) tmax = t;
    }
    printf("Mean %.3lf us, max %.3lf us\n", tsum / nt * 1e6, tmax * 1e6);
    // 2 * m * n * k FLOPs per GEMM, d_batch GEMMs per call, nt timed calls.
    double tflops = (double)d_batch * d_matx * d_in * (double)d_hidden * nt * 2e-12 / tsum;
    printf("%.3lf TFLOPS\n", tflops);
}