Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
c6703d1e
Unverified
Commit
c6703d1e
authored
Jun 16, 2025
by
Lu Fang
Committed by
GitHub
Jun 15, 2025
Browse files
[MISC] Remove unused variables in C++ (#19609)
Signed-off-by:
Lu Fang
<
lufang@fb.com
>
parent
a5e7242d
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
2 additions
and
39 deletions
+2
-39
csrc/attention/paged_attention_v1.cu
csrc/attention/paged_attention_v1.cu
+1
-4
csrc/attention/paged_attention_v2.cu
csrc/attention/paged_attention_v2.cu
+1
-4
csrc/prepare_inputs/advance_step.cu
csrc/prepare_inputs/advance_step.cu
+0
-1
csrc/quantization/fp8/amd/quant_utils.cuh
csrc/quantization/fp8/amd/quant_utils.cuh
+0
-2
csrc/quantization/gptq/q_gemm.cu
csrc/quantization/gptq/q_gemm.cu
+0
-8
csrc/rocm/attention.cu
csrc/rocm/attention.cu
+0
-20
No files found.
csrc/attention/paged_attention_v1.cu
View file @
c6703d1e
...
...
@@ -65,9 +65,6 @@ void paged_attention_v1_launcher(
int
kv_block_stride
=
key_cache
.
stride
(
0
);
int
kv_head_stride
=
key_cache
.
stride
(
1
);
[[
maybe_unused
]]
int
thread_group_size
=
MAX
(
WARP_SIZE
/
BLOCK_SIZE
,
1
);
assert
(
head_size
%
thread_group_size
==
0
);
// NOTE: alibi_slopes is optional.
const
float
*
alibi_slopes_ptr
=
alibi_slopes
...
...
@@ -193,4 +190,4 @@ void paged_attention_v1(
#undef WARP_SIZE
#undef MAX
#undef MIN
#undef DIVIDE_ROUND_UP
\ No newline at end of file
#undef DIVIDE_ROUND_UP
csrc/attention/paged_attention_v2.cu
View file @
c6703d1e
...
...
@@ -66,9 +66,6 @@ void paged_attention_v2_launcher(
int
kv_block_stride
=
key_cache
.
stride
(
0
);
int
kv_head_stride
=
key_cache
.
stride
(
1
);
[[
maybe_unused
]]
int
thread_group_size
=
MAX
(
WARP_SIZE
/
BLOCK_SIZE
,
1
);
assert
(
head_size
%
thread_group_size
==
0
);
// NOTE: alibi_slopes is optional.
const
float
*
alibi_slopes_ptr
=
alibi_slopes
...
...
@@ -203,4 +200,4 @@ void paged_attention_v2(
#undef WARP_SIZE
#undef MAX
#undef MIN
#undef DIVIDE_ROUND_UP
\ No newline at end of file
#undef DIVIDE_ROUND_UP
csrc/prepare_inputs/advance_step.cu
View file @
c6703d1e
...
...
@@ -274,7 +274,6 @@ void advance_step_flashinfer(
cudaDeviceGetAttribute
(
&
blocks
,
cudaDevAttrMultiProcessorCount
,
dev
);
cudaDeviceGetAttribute
(
&
threads
,
cudaDevAttrMaxThreadsPerBlock
,
dev
);
[[
maybe_unused
]]
int
block_tables_stride
=
block_tables
.
stride
(
0
);
TORCH_CHECK
((
blocks
*
threads
>
num_queries
),
"multi-step: not enough threads to map to num_queries = "
,
num_queries
,
" block_tables.stride(0) = "
,
block_tables
.
stride
(
0
),
...
...
csrc/quantization/fp8/amd/quant_utils.cuh
View file @
c6703d1e
...
...
@@ -446,8 +446,6 @@ scaled_vec_conversion<uint16_t, uint8_t>(const uint8_t& a, float scale) {
template
<
>
__inline__
__device__
uint32_t
scaled_vec_conversion
<
uint32_t
,
uint16_t
>
(
const
uint16_t
&
a
,
float
scale
)
{
[[
maybe_unused
]]
__half2_raw
h2r
=
__hip_cvt_fp8x2_to_halfraw2
(
a
,
fp8_type
::
__default_interpret
);
union
{
__half2_raw
h2r
;
uint32_t
ui32
;
...
...
csrc/quantization/gptq/q_gemm.cu
View file @
c6703d1e
...
...
@@ -206,8 +206,6 @@ __global__ void gemm_half_q_half_gptq_4bit_kernel(
auto
offset_m
=
blockIdx
.
y
*
m_count
;
auto
offset_k
=
blockIdx
.
z
*
BLOCK_KN_SIZE
;
[[
maybe_unused
]]
int
end_n
=
min
(
offset_n
+
BLOCK_KN_SIZE
*
4
,
size_n
);
[[
maybe_unused
]]
int
end_m
=
min
(
offset_m
+
m_count
,
size_m
);
int
end_k
=
min
(
offset_k
+
BLOCK_KN_SIZE
,
size_k
);
int
n
=
offset_n
+
t
*
4
;
...
...
@@ -344,8 +342,6 @@ __global__ void gemm_half_q_half_gptq_2bit_kernel(
auto
offset_m
=
blockIdx
.
y
*
m_count
;
auto
offset_k
=
blockIdx
.
z
*
BLOCK_KN_SIZE
;
[[
maybe_unused
]]
int
end_n
=
min
(
offset_n
+
BLOCK_KN_SIZE
*
4
,
size_n
);
[[
maybe_unused
]]
int
end_m
=
min
(
offset_m
+
m_count
,
size_m
);
int
end_k
=
min
(
offset_k
+
BLOCK_KN_SIZE
,
size_k
);
int
n
=
offset_n
+
t
*
4
;
...
...
@@ -465,8 +461,6 @@ __global__ void gemm_half_q_half_gptq_3bit_kernel(
auto
offset_m
=
blockIdx
.
y
*
m_count
;
auto
offset_k
=
blockIdx
.
z
*
BLOCK_KN_SIZE
;
[[
maybe_unused
]]
int
end_n
=
min
(
offset_n
+
BLOCK_KN_SIZE
*
4
,
size_n
);
[[
maybe_unused
]]
int
end_m
=
min
(
offset_m
+
m_count
,
size_m
);
int
end_k
=
min
(
offset_k
+
BLOCK_KN_SIZE
,
size_k
);
int
n
=
offset_n
+
t
*
4
;
...
...
@@ -593,8 +587,6 @@ __global__ void gemm_half_q_half_gptq_8bit_kernel(
auto
offset_m
=
blockIdx
.
y
*
m_count
;
auto
offset_k
=
blockIdx
.
z
*
BLOCK_KN_SIZE
;
[[
maybe_unused
]]
int
end_n
=
min
(
offset_n
+
BLOCK_KN_SIZE
*
4
,
size_n
);
[[
maybe_unused
]]
int
end_m
=
min
(
offset_m
+
m_count
,
size_m
);
int
end_k
=
min
(
offset_k
+
BLOCK_KN_SIZE
,
size_k
);
int
n
=
offset_n
+
t
*
4
;
...
...
csrc/rocm/attention.cu
View file @
c6703d1e
...
...
@@ -136,11 +136,6 @@ __device__ __forceinline__ T from_float(const float& inp) {
template
<
typename
T
>
__device__
__forceinline__
_B16x4
from_floatx4
(
const
floatx4
&
inp
)
{
[[
maybe_unused
]]
union
tmpcvt
{
uint16_t
u
;
_Float16
f
;
__hip_bfloat16
b
;
}
t16
;
_B16x4
ret
;
if
constexpr
(
std
::
is_same
<
T
,
_Float16
>::
value
)
{
union
h2cvt
{
...
...
@@ -169,11 +164,6 @@ __device__ __forceinline__ _B16x4 from_floatx4(const floatx4& inp) {
template
<
typename
T
>
__device__
__forceinline__
_B16x4
addx4
(
const
_B16x4
&
inp1
,
const
_B16x4
&
inp2
)
{
[[
maybe_unused
]]
union
tmpcvt
{
uint16_t
u
;
_Float16
f
;
__hip_bfloat16
b
;
}
t1
,
t2
,
res
;
_B16x4
ret
;
if
constexpr
(
std
::
is_same
<
T
,
_Float16
>::
value
)
{
union
h2cvt
{
...
...
@@ -325,8 +315,6 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
constexpr
int
GQA_RATIO4
=
DIVIDE_ROUND_UP
(
GQA_RATIO
,
4
);
[[
maybe_unused
]]
__shared__
float
shared_qk_max
[
NWARPS
][
16
+
1
];
[[
maybe_unused
]]
__shared__
float
shared_exp_sum
[
NWARPS
][
16
+
1
];
// shared_logits is used for multiple purposes
__shared__
_B16x4
shared_logits
[
NWARPS
][
4
][
16
][
4
];
...
...
@@ -444,8 +432,6 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
const
cache_t
*
k_ptr2
=
k_ptr
+
kblock_number
*
kv_block_stride
;
const
int
klocal_token_idx
=
TOKENS_PER_WARP
*
warpid
+
token_depth
*
16
+
lane16id
;
[[
maybe_unused
]]
const
int
kglobal_token_idx
=
partition_start_token_idx
+
klocal_token_idx
;
const
int
kphysical_block_offset
=
klocal_token_idx
%
BLOCK_SIZE
;
const
cache_t
*
k_ptr3
=
k_ptr2
+
kphysical_block_offset
*
KX
;
...
...
@@ -1309,9 +1295,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
const
int
context_len
=
context_lens
[
seq_idx
];
const
int
num_partitions
=
DIVIDE_ROUND_UP
(
context_len
,
PARTITION_SIZE
);
[[
maybe_unused
]]
constexpr
int
NUM_WARPS
=
NUM_THREADS
/
WARP_SIZE
;
const
auto
warpid
=
threadIdx
.
x
/
WARP_SIZE
;
[[
maybe_unused
]]
const
auto
laneid
=
threadIdx
.
x
%
WARP_SIZE
;
__shared__
float
shared_global_exp_sum
;
// max num partitions supported is warp_size * NPAR_LOOPS
...
...
@@ -2080,9 +2064,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
const
int
context_len
=
context_lens
[
seq_idx
];
const
int
num_partitions
=
DIVIDE_ROUND_UP
(
context_len
,
PARTITION_SIZE
);
[[
maybe_unused
]]
constexpr
int
NUM_WARPS
=
NUM_THREADS
/
WARP_SIZE
;
const
int
warpid
=
threadIdx
.
x
/
WARP_SIZE
;
[[
maybe_unused
]]
const
int
laneid
=
threadIdx
.
x
%
WARP_SIZE
;
__shared__
float
shared_global_exp_sum
;
// max num partitions supported is warp_size * NPAR_LOOPS
...
...
@@ -2816,9 +2798,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
const
int
context_len
=
context_lens
[
seq_idx
];
const
int
num_partitions
=
DIVIDE_ROUND_UP
(
context_len
,
PARTITION_SIZE
);
[[
maybe_unused
]]
constexpr
int
NUM_WARPS
=
NUM_THREADS
/
WARP_SIZE
;
const
int
warpid
=
threadIdx
.
x
/
WARP_SIZE
;
[[
maybe_unused
]]
const
int
laneid
=
threadIdx
.
x
%
WARP_SIZE
;
__shared__
float
shared_global_exp_sum
;
// max num partitions supported is warp_size * NPAR_LOOPS
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment