Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
b4522474
Unverified
Commit
b4522474
authored
Sep 25, 2024
by
sasha0552
Committed by
GitHub
Sep 24, 2024
Browse files
[Bugfix][Kernel] Implement acquire/release polyfill for Pascal (#8776)
parent
ee777d9c
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
18 additions
and
0 deletions
+18
-0
csrc/custom_all_reduce.cuh
csrc/custom_all_reduce.cuh
+11
-0
csrc/custom_all_reduce_test.cu
csrc/custom_all_reduce_test.cu
+7
-0
No files found.
csrc/custom_all_reduce.cuh
View file @
b4522474
...
@@ -131,15 +131,26 @@ DINLINE O downcast(array_t<float, O::size> val) {
...
@@ -131,15 +131,26 @@ DINLINE O downcast(array_t<float, O::size> val) {
}
}
static
DINLINE
void
st_flag_release
(
FlagType
*
flag_addr
,
FlagType
flag
)
{
static
DINLINE
void
st_flag_release
(
FlagType
*
flag_addr
,
FlagType
flag
)
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
asm
volatile
(
"st.release.sys.global.u32 [%1], %0;"
::
"r"
(
flag
),
asm
volatile
(
"st.release.sys.global.u32 [%1], %0;"
::
"r"
(
flag
),
"l"
(
flag_addr
));
"l"
(
flag_addr
));
#else
asm
volatile
(
"membar.sys; st.volatile.global.u32 [%1], %0;"
::
"r"
(
flag
),
"l"
(
flag_addr
));
#endif
}
}
static
DINLINE
FlagType
ld_flag_acquire
(
FlagType
*
flag_addr
)
{
static
DINLINE
FlagType
ld_flag_acquire
(
FlagType
*
flag_addr
)
{
FlagType
flag
;
FlagType
flag
;
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
asm
volatile
(
"ld.acquire.sys.global.u32 %0, [%1];"
asm
volatile
(
"ld.acquire.sys.global.u32 %0, [%1];"
:
"=r"
(
flag
)
:
"=r"
(
flag
)
:
"l"
(
flag_addr
));
:
"l"
(
flag_addr
));
#else
asm
volatile
(
"ld.volatile.global.u32 %0, [%1]; membar.gl;"
:
"=r"
(
flag
)
:
"l"
(
flag_addr
));
#endif
return
flag
;
return
flag
;
}
}
...
...
csrc/custom_all_reduce_test.cu
View file @
b4522474
...
@@ -44,7 +44,14 @@
...
@@ -44,7 +44,14 @@
} while (0)
} while (0)
__global__
void
dummy_kernel
()
{
__global__
void
dummy_kernel
()
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
for
(
int
i
=
0
;
i
<
100
;
i
++
)
__nanosleep
(
1000000
);
// 100ms
for
(
int
i
=
0
;
i
<
100
;
i
++
)
__nanosleep
(
1000000
);
// 100ms
#else
for
(
int
i
=
0
;
i
<
100
;
i
++
)
{
long
long
int
start
=
clock64
();
while
(
clock64
()
-
start
<
150000000
);
// approximately 98.4ms on P40
}
#endif
}
}
template
<
typename
T
>
template
<
typename
T
>
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment