Commit 79ccb7b6 authored by Rick Ho's avatar Rick Ho
Browse files

fix pytorch header compilation bug

parent 15f98a10
......@@ -4,43 +4,11 @@
#include <thread>
#include <iostream>
#ifdef MOE_USE_NCCL
#include <c10d/ProcessGroupNCCL.hpp>
#endif // MOE_USE_NCCL
#include "cuda_stream_manager.h"
#include <helper_cuda.h>
#define SMGR_N_STREAMS 16
#ifdef MOE_USE_NCCL
// Subclass trick: ProcessGroupNCCL keeps its NCCL communicators behind the
// protected getNCCLComm() accessor. Deriving from it lets us reach that
// accessor and borrow PyTorch's already-initialized communicator instead of
// creating a second one for the same device.
class HackNCCLGroup: public c10d::ProcessGroupNCCL {
public:
    // Return the NCCL communicator PyTorch holds for `dev`, or 0 when the
    // process group has not initialized one yet.
    ncclComm_t getcomm(at::Device dev) {
        auto devKey = std::to_string(dev.index());
        auto comms = getNCCLComm(devKey, {dev}, c10d::OpType::ALLTOALL);
        if (comms.empty()) {
            std::cerr << "PyTorch has nothing\n";
            return 0;
        }
        return comms[0]->getNcclComm();
    }
};
void CudaStreamManager::ensure(void* torchp, at::Device dev) {
if (this->ncclgood) {
return;
}
HackNCCLGroup* h = (HackNCCLGroup*)torchp;
this->ncclcomm = h->getcomm(dev);
if (this->ncclcomm != 0) {
this->ncclgood = 1;
} else {
std::cerr << "Nccl initialization failed\n";
}
}
#endif // MOE_USE_NCCL
// Map an arbitrary index onto the fixed pool of SMGR_N_STREAMS CUDA streams
// (round-robin via modulo), so callers may use any non-negative index.
cudaStream_t CudaStreamManager::stream(size_t idx) {
    size_t slot = idx % SMGR_N_STREAMS;
    return streams[slot];
}
......
......@@ -25,7 +25,6 @@ public:
#ifdef MOE_USE_NCCL
char ncclgood;
ncclComm_t ncclcomm;
void ensure(void*, class at::Device);
#endif
public:
......
......@@ -109,18 +109,37 @@ std::vector<torch::Tensor> moe_global_fused_forward(
global_batch_size, local_batch_size, n_workers);
}
#endif
/*
int main() {
int device=2;
torch::Tensor input = torch::randn({2048, 512}, torch::dtype(torch::kFloat32).device(torch::kCUDA, device));
torch::Tensor gate = torch::zeros({2048, 2}, torch::dtype(torch::kInt64));
torch::Tensor weight = torch::randn({2, 512, 2048}, torch::dtype(torch::kFloat32).device(torch::kCUDA, device));
checkCudaErrors(cudaSetDevice(device));
moe_cuda_forward(input, gate, weight);
#include <c10d/ProcessGroupNCCL.hpp>
#include "cuda_stream_manager.h"
// Subclass exists only to expose the protected getNCCLComm() accessor of
// ProcessGroupNCCL, so the extension can reuse the NCCL communicator that
// PyTorch has already set up for this device rather than create its own.
class HackNCCLGroup: public c10d::ProcessGroupNCCL {
public:
// Returns PyTorch's NCCL communicator for `dev`, or 0 if the process
// group holds none for that device key.
ncclComm_t getcomm(at::Device dev) {
auto key = std::to_string(dev.index());
auto v = getNCCLComm(key, {dev}, c10d::OpType::ALLTOALL);
if (v.size() == 0) {
std::cerr << "PyTorch has nothing\n";
return 0;
}
return v[0]->getNcclComm();
}
};
// Lazily borrow the NCCL communicator from PyTorch process group `p` for
// tensor `t`'s device and cache it in the per-device stream manager.
// Idempotent: becomes a no-op once the communicator is acquired.
void moe_ensure_nccl(c10d::ProcessGroupNCCL& p, torch::Tensor t) {
    auto dev = t.device();
    auto smgr = getCudaStreamManager(dev.index());
    if (smgr->ncclgood) {
        return;  // already have a working communicator
    }
    // Downcast to the hack subclass solely to reach the protected accessor.
    auto* hacked = reinterpret_cast<HackNCCLGroup*>(&p);
    smgr->ncclcomm = hacked->getcomm(dev);
    if (smgr->ncclcomm == 0) {
        std::cerr << "Nccl initialization failed\n";
    } else {
        smgr->ncclgood = 1;
    }
}
*/
#endif // MOE_USE_NCCL
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("expert_count", &moe_expert_count, "MoE expert count (CUDA)");
......
......@@ -195,9 +195,4 @@ std::vector<torch::Tensor> moe_cuda_global_gather(
return {local_output_buf,};
}
// Thin wrapper: looks up the stream manager for `t`'s device and delegates
// the NCCL communicator setup to its ensure() method, passing the process
// group type-erased as void* (ensure() casts it back internally).
void moe_ensure_nccl(c10d::ProcessGroupNCCL& p, torch::Tensor t) {
auto smgr = getCudaStreamManager(t.device().index());
smgr->ensure((void*)&p, t.device());
}
#endif
......@@ -350,46 +350,3 @@ std::vector<torch::Tensor> moe_cuda_backward(
return {grad_input_buf, grad_weight};
}
/*
int main() {
typedef float data_t;
size_t batch_size = 4096;
size_t top_k = 2;
size_t num_expert = 128;
size_t in_feat = 1024;
size_t out_feat = 4096;
data_t *input, *weight;
data_t *output;
size_t *gate;
checkCudaErrors(cudaMalloc(&input, batch_size * in_feat * sizeof(data_t)));
checkCudaErrors(cudaMalloc(&weight, num_expert * in_feat * out_feat * sizeof(data_t)));
checkCudaErrors(cudaMalloc(&output, batch_size * top_k * out_feat * sizeof(data_t)));
checkCudaErrors(cudaMalloc(&gate, batch_size * top_k * sizeof(size_t)));
size_t nt = 16;
double tsum = 0, tmax = 0;
size_t *gate_host = new size_t[batch_size * top_k];
for (size_t i=0; i<batch_size * top_k; ++i) {
gate_host[i] = rand() % num_expert;
}
checkCudaErrors(cudaMemcpy(gate, gate_host, batch_size * top_k * sizeof(size_t), cudaMemcpyHostToDevice));
moe_first_linear_cuda_forward<data_t>(input, gate, weight, output, batch_size, top_k, in_feat, out_feat);
for (size_t i=0; i<nt; ++i) {
timestamp(start);
moe_first_linear_cuda_forward<data_t>(input, gate, weight, output, batch_size, top_k, in_feat, out_feat);
timestamp(end);
auto t = getDuration(start, end);
tsum += t;
if (t > tmax) tmax = t;
}
printf("Mean %.3lf us, max %.3lf us\n", tsum / nt * 1e6, tmax * 1e6);
double tflops = (double)batch_size * top_k * in_feat * out_feat * nt * 2e-12 / tsum;
printf("%.3lf TFLOPs\n", tflops);
}
*/
......@@ -41,9 +41,6 @@ std::vector<torch::Tensor> moe_cuda_global_gather(
torch::Tensor global_expert_count,
long batch_size, long n_workers);
#include <c10d/ProcessGroupNCCL.hpp>
void moe_ensure_nccl(c10d::ProcessGroupNCCL&, torch::Tensor t);
std::vector<torch::Tensor> moe_cuda_expert_exchange(
torch::Tensor local_expert_count,
long num_expert, long n_workers);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment