OpenDAS / FastMoE · Commit 241cdb78
authored Jan 25, 2021 by Rick Ho
remove redundant files
parent c5cfd5fb

Showing 3 changed files with 0 additions and 258 deletions:

cuda/CMakeLists.txt            +0 −39
cuda/offset_generator.cu       +0 −79
cuda/sgemm_batched_example.cu  +0 −140

cuda/CMakeLists.txt — deleted, 100644 → 0

cmake_minimum_required(VERSION 3.0 FATAL_ERROR)
project(moe)

find_package(Torch REQUIRED)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")

if(NOT PYTHON_INCLUDE)
    set(PYTHON_INCLUDE "/home/jiezhong/miniconda3/include/python3.8")
endif()
if(NOT CUDA_HOME)
    set(CUDA_HOME "/usr/local/cuda")
endif()
if(NOT CUDA_SAMPLE_INCLUDE)
    set(CUDA_SAMPLE_INCLUDE "/usr/local/cuda/samples/common/inc")
endif()

include_directories("${PYTHON_INCLUDE}" "${CUDA_HOME}/include" "${CUDA_SAMPLE_INCLUDE}")

add_executable(moe moe.cpp cuda_stream_manager.cpp)
target_link_libraries(moe "${TORCH_LIBRARIES}")
set_property(TARGET moe PROPERTY CXX_STANDARD 14)

# The following code block is suggested to be used on Windows.
# According to https://github.com/pytorch/pytorch/issues/25457,
# the DLLs need to be copied to avoid memory errors.
if(MSVC)
    file(GLOB TORCH_DLLS "${TORCH_INSTALL_PREFIX}/lib/*.dll")
    add_custom_command(TARGET moe
                       POST_BUILD
                       COMMAND ${CMAKE_COMMAND} -E copy_if_different
                       ${TORCH_DLLS}
                       $<TARGET_FILE_DIR:moe>)
endif(MSVC)

cuda/offset_generator.cu — deleted, 100644 → 0

#include <cuda_runtime.h>
#include <iostream>
#include <vector>
#include <cstdlib>
#include <cstdio>

using namespace std;

typedef float data_t;

cudaStream_t st = 0;

// Fill ptrs[i] with the address of the i-th contiguous block of `stride`
// elements starting at `base`.
__global__
void generate_ptr_sequential_kernel(int n, data_t* base, size_t stride, data_t** ptrs) {
    size_t idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx < n) {
        ptrs[idx] = base + stride * idx;
    }
}

// Fill ptrs[i] with the address of block offset[i]: a gather over blocks.
__global__
void generate_ptr_offset_kernel(int n, data_t* base, size_t stride, int* offset, data_t** ptrs) {
    size_t idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx < n) {
        ptrs[idx] = base + stride * offset[idx];
    }
}

#define CEIL(_x_,_y_) (((_x_)-1)/(_y_)+1)

data_t** generate_ptr(int n, data_t* base, size_t stride, int* d_offset = 0) {
    dim3 griddim(CEIL(n, 256));
    dim3 blockdim(256);
    data_t** ptrs;
    cudaMalloc(&ptrs, n * sizeof(data_t*));
    if (d_offset) {
        generate_ptr_offset_kernel<<<griddim, blockdim, 0, st>>>(n, base, stride, d_offset, ptrs);
    } else {
        generate_ptr_sequential_kernel<<<griddim, blockdim, 0, st>>>(n, base, stride, ptrs);
    }
    cudaError_t err = cudaPeekAtLastError();
    if (err) {
        std::cerr << "CUDA " << cudaGetErrorString(err)
                  << " at " << __FILE__ << ":" << __LINE__ << std::endl;
    }
    cudaStreamSynchronize(st);
    return ptrs;
}

int main() {
    cudaStreamCreate(&st);
    int n = 128;
    int offset[128], *d_offset;
    // `base` is a dummy address: the program only prints the generated
    // pointer values and never dereferences them.
    float* base = (float*)0x10, **d_res, **res;
    for (int i = 0; i < n; ++i) {
        offset[i] = rand() % n;
    }
    cudaMalloc(&d_offset, n * sizeof(int));
    cudaMemcpy(d_offset, offset, n * sizeof(int), cudaMemcpyHostToDevice);

    d_res = generate_ptr(n, base, 0x100);
    res = new float*[n];
    cudaMemcpy(res, d_res, n * sizeof(float*), cudaMemcpyDeviceToHost);
    puts("Sequential addr check");
    for (int i = 0; i < 10; ++i) {
        printf("%08lx ", (unsigned long)res[i]);
    }
    putchar('\n');

    d_res = generate_ptr(n, base, 0x400, d_offset);
    cudaMemcpy(res, d_res, n * sizeof(float*), cudaMemcpyDeviceToHost);
    puts("Offset addr check");
    for (int i = 0; i < 10; ++i) {
        printf("%08lx /%08x\n", (unsigned long)res[i], offset[i]);
    }
    putchar('\n');
}
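
The pointer arrays produced by generate_ptr are exactly the device-side Aarray/Barray/Carray arguments that cuBLAS batched GEMM consumes, built in a kernel instead of on the host as in sgemm_batched_example.cu below. A minimal sketch of that intended use, assuming cublas_v2.h is included and assuming hypothetical device buffers input, weights, and hidden, a routing array d_expert_idx, and a valid handle hdl (none of these appear in this file):

// Sketch only: every name in this signature is a placeholder assumption.
void batched_expert_gemm(cublasHandle_t hdl, int n, int d_in, int d_hidden,
                         data_t* input, data_t* weights, data_t* hidden,
                         int* d_expert_idx) {
    data_t** aptrs = generate_ptr(n, input, d_in);               // row i of input
    data_t** bptrs = generate_ptr(n, weights,
                                  (size_t)d_in * d_hidden,
                                  d_expert_idx);                 // expert of row i
    data_t** cptrs = generate_ptr(n, hidden, d_hidden);          // row i of output
    data_t alpha = 1, beta = 0;
    // One (1 x d_in) by (d_in x d_hidden) product per batch entry, using the
    // same operand layout as the benchmark in sgemm_batched_example.cu.
    cublasSgemmBatched(hdl, CUBLAS_OP_T, CUBLAS_OP_T,
                       1, d_hidden, d_in, &alpha,
                       aptrs, d_in,
                       bptrs, d_hidden,
                       &beta, cptrs, 1, n);
    cudaFree(aptrs); cudaFree(bptrs); cudaFree(cptrs);
}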

cuda/sgemm_batched_example.cu — deleted, 100644 → 0

#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <iostream>
#include <vector>
#include <cstdlib>
#include <cstdio>
#include "timer.hh"

static const char* geterr(cublasStatus_t error) {
    switch (error) {
    case CUBLAS_STATUS_SUCCESS:          return "CUBLAS_STATUS_SUCCESS";
    case CUBLAS_STATUS_NOT_INITIALIZED:  return "CUBLAS_STATUS_NOT_INITIALIZED";
    case CUBLAS_STATUS_ALLOC_FAILED:     return "CUBLAS_STATUS_ALLOC_FAILED";
    case CUBLAS_STATUS_INVALID_VALUE:    return "CUBLAS_STATUS_INVALID_VALUE";
    case CUBLAS_STATUS_ARCH_MISMATCH:    return "CUBLAS_STATUS_ARCH_MISMATCH";
    case CUBLAS_STATUS_MAPPING_ERROR:    return "CUBLAS_STATUS_MAPPING_ERROR";
    case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
    case CUBLAS_STATUS_INTERNAL_ERROR:   return "CUBLAS_STATUS_INTERNAL_ERROR";
    }
    return "<unknown>";
}

#define cublas_safe_call(__fn__) { \
    cublasStatus_t res = __fn__; \
    if (res != CUBLAS_STATUS_SUCCESS) { \
        std::cerr << "Cublas " << geterr(res) << " at " << __FILE__ << ":" << __LINE__ << std::endl; \
    } \
}
#define cuda_safe_call(__fn__) { \
    auto res = __fn__; \
    if (res) { \
        std::cerr << "CUDA " << cudaGetErrorString(res) << " at " << __FILE__ << ":" << __LINE__ << std::endl; \
    } \
}

using namespace std;

typedef float data_t;

// Benchmark sizes: d_batch tokens of d_matx rows each, routed among
// d_experts expert matrices of shape d_in x d_hidden.
int d_batch = 4096;
int d_matx = 1;
int d_experts = 128;
int d_in = 1024;
int d_hidden = 4096;
int d_out = 1024;

data_t *featin, *feath, *weight1, *weight2, *featout;
int* offset;
cublasHandle_t hdl;
cudaStream_t st;

void prepare() {
    cudaStreamCreate(&st);
    cublasCreate(&hdl);
    cublasSetStream(hdl, st);
}

void compute() {
    vector<data_t*> aptrs, bptrs, cptrs;
    float** ptrs;
    cudaMalloc(&ptrs, d_batch * sizeof(float*) * 3);
    // Token i multiplies its input row by the weight matrix of the expert
    // it is routed to, offset[i].
    for (int i = 0; i < d_batch; ++i) {
        aptrs.push_back(featin + 1 * d_in * i);
        bptrs.push_back(weight1 + d_hidden * d_in * offset[i]);
        cptrs.push_back(feath + d_hidden * i);
    }
    cudaMemcpy(ptrs, aptrs.data(), d_batch * sizeof(float*), cudaMemcpyHostToDevice);
    cudaMemcpy(ptrs + d_batch, bptrs.data(), d_batch * sizeof(float*), cudaMemcpyHostToDevice);
    cudaMemcpy(ptrs + d_batch * 2, cptrs.data(), d_batch * sizeof(float*), cudaMemcpyHostToDevice);
    data_t alpha = 1, beta = 0;
    cublas_safe_call(cublasSgemmBatched(hdl,
            CUBLAS_OP_T, CUBLAS_OP_T,
            d_matx, d_hidden, d_in,
            &alpha,
            ptrs, d_in,
            ptrs + d_batch, d_hidden,
            &beta,
            ptrs + d_batch * 2, d_matx,
            d_batch));
    cudaStreamSynchronize(st);
    // cudaDeviceSynchronize();
    cudaFree(ptrs);
}

int main() {
    cuda_safe_call(cudaMalloc(&weight1, d_in * d_hidden * d_experts * sizeof(data_t)));
    cudaMalloc(&weight2, d_out * d_hidden * d_experts * sizeof(data_t));
    cudaMalloc(&featin, d_batch * d_matx * d_in * sizeof(data_t));
    cudaMalloc(&feath, d_batch * d_matx * d_hidden * sizeof(data_t));
    cudaMalloc(&featout, d_batch * d_matx * d_out * sizeof(data_t));
    prepare();
    double tsum = 0, tmax = 0;
    int nt = 16;
    offset = new int[d_batch];
    for (int i = 0; i < d_batch; ++i) {
        offset[i] = rand() % d_experts;
    }
    compute();  // warm-up run, excluded from the timing below
    for (int i = 0; i < nt; ++i) {
        for (int j = 0; j < d_batch; ++j) {
            offset[j] = rand() % d_experts;
        }
        timestamp(start);
        compute();
        timestamp(end);
        auto t = getDuration(start, end);
        tsum += t;
        if (t > tmax) tmax = t;
    }
    printf("Mean %.3lf us, max %.3lf us\n", tsum / nt * 1e6, tmax * 1e6);
    // 2 * m * n * k flops per GEMM, summed over the nt timed runs.
    double tflops = (double)d_batch * d_matx * d_in * (double)d_hidden * nt * 2e-12 / tsum;
    printf("%.3lf TFLOP/s\n", tflops);
}
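
For comparison: the A and C pointer patterns above are uniformly strided, so in the degenerate case where every token shares one expert the pointer arrays can be dropped entirely in favor of cublasSgemmStridedBatched. A hedged sketch of that case, reusing the globals above and assuming all tokens route to expert 0; it relies on a zero B stride to broadcast a single weight matrix, which current cuBLAS releases accept:

// Sketch only: alternative to compute() valid when offset[i] == 0 for all i.
void compute_single_expert() {
    data_t alpha = 1, beta = 0;
    cublas_safe_call(cublasSgemmStridedBatched(hdl,
            CUBLAS_OP_T, CUBLAS_OP_T,
            d_matx, d_hidden, d_in,
            &alpha,
            featin, d_in, (long long)d_matx * d_in,       // A: one token per entry
            weight1, d_hidden, 0LL,                       // B: broadcast expert 0
            &beta,
            feath, d_matx, (long long)d_matx * d_hidden,  // C: one row per entry
            d_batch));
    cudaStreamSynchronize(st);
}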