Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
8c0d15d5
Unverified
Commit 8c0d15d5, authored Mar 14, 2025 by Lu Fang. Committed by GitHub on Mar 15, 2025.
Browse files
[Misc][Easy] Annotate unused vars in the csrc files (#14798)
Signed-off-by: Lu Fang <lufang@fb.com>
parent
97ac781c
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing 4 changed files with 14 additions and 13 deletions
+14
-13
csrc/prepare_inputs/advance_step.cu
csrc/prepare_inputs/advance_step.cu
+1
-1
csrc/quantization/fp8/amd/quant_utils.cuh
csrc/quantization/fp8/amd/quant_utils.cuh
+1
-1
csrc/quantization/gptq/q_gemm.cu
csrc/quantization/gptq/q_gemm.cu
+8
-8
csrc/rocm/attention.cu
csrc/rocm/attention.cu
+4
-3
No files found.
csrc/prepare_inputs/advance_step.cu
View file @
8c0d15d5
...
...
@@ -274,7 +274,7 @@ void advance_step_flashinfer(
cudaDeviceGetAttribute
(
&
blocks
,
cudaDevAttrMultiProcessorCount
,
dev
);
cudaDeviceGetAttribute
(
&
threads
,
cudaDevAttrMaxThreadsPerBlock
,
dev
);
int
block_tables_stride
=
block_tables
.
stride
(
0
);
[[
maybe_unused
]]
int
block_tables_stride
=
block_tables
.
stride
(
0
);
TORCH_CHECK
((
blocks
*
threads
>
num_queries
),
"multi-step: not enough threads to map to num_queries = "
,
num_queries
,
" block_tables.stride(0) = "
,
block_tables
.
stride
(
0
),
...
...
csrc/quantization/fp8/amd/quant_utils.cuh
View file @
8c0d15d5
...
...
@@ -446,7 +446,7 @@ scaled_vec_conversion<uint16_t, uint8_t>(const uint8_t& a, float scale) {
template
<
>
__inline__
__device__
uint32_t
scaled_vec_conversion
<
uint32_t
,
uint16_t
>
(
const
uint16_t
&
a
,
float
scale
)
{
__half2_raw
h2r
=
[[
maybe_unused
]]
__half2_raw
h2r
=
__hip_cvt_fp8x2_to_halfraw2
(
a
,
fp8_type
::
__default_interpret
);
union
{
__half2_raw
h2r
;
...
...
csrc/quantization/gptq/q_gemm.cu
View file @
8c0d15d5
...
...
@@ -206,8 +206,8 @@ __global__ void gemm_half_q_half_gptq_4bit_kernel(
int
offset_m
=
blockIdx
.
y
*
m_count
;
int
offset_k
=
blockIdx
.
z
*
BLOCK_KN_SIZE
;
int
end_n
=
min
(
offset_n
+
BLOCK_KN_SIZE
*
4
,
size_n
);
int
end_m
=
min
(
offset_m
+
m_count
,
size_m
);
[[
maybe_unused
]]
int
end_n
=
min
(
offset_n
+
BLOCK_KN_SIZE
*
4
,
size_n
);
[[
maybe_unused
]]
int
end_m
=
min
(
offset_m
+
m_count
,
size_m
);
int
end_k
=
min
(
offset_k
+
BLOCK_KN_SIZE
,
size_k
);
int
n
=
offset_n
+
t
*
4
;
...
...
@@ -344,8 +344,8 @@ __global__ void gemm_half_q_half_gptq_2bit_kernel(
int
offset_m
=
blockIdx
.
y
*
m_count
;
int
offset_k
=
blockIdx
.
z
*
BLOCK_KN_SIZE
;
int
end_n
=
min
(
offset_n
+
BLOCK_KN_SIZE
*
4
,
size_n
);
int
end_m
=
min
(
offset_m
+
m_count
,
size_m
);
[[
maybe_unused
]]
int
end_n
=
min
(
offset_n
+
BLOCK_KN_SIZE
*
4
,
size_n
);
[[
maybe_unused
]]
int
end_m
=
min
(
offset_m
+
m_count
,
size_m
);
int
end_k
=
min
(
offset_k
+
BLOCK_KN_SIZE
,
size_k
);
int
n
=
offset_n
+
t
*
4
;
...
...
@@ -465,8 +465,8 @@ __global__ void gemm_half_q_half_gptq_3bit_kernel(
int
offset_m
=
blockIdx
.
y
*
m_count
;
int
offset_k
=
blockIdx
.
z
*
BLOCK_KN_SIZE
;
int
end_n
=
min
(
offset_n
+
BLOCK_KN_SIZE
*
4
,
size_n
);
int
end_m
=
min
(
offset_m
+
m_count
,
size_m
);
[[
maybe_unused
]]
int
end_n
=
min
(
offset_n
+
BLOCK_KN_SIZE
*
4
,
size_n
);
[[
maybe_unused
]]
int
end_m
=
min
(
offset_m
+
m_count
,
size_m
);
int
end_k
=
min
(
offset_k
+
BLOCK_KN_SIZE
,
size_k
);
int
n
=
offset_n
+
t
*
4
;
...
...
@@ -593,8 +593,8 @@ __global__ void gemm_half_q_half_gptq_8bit_kernel(
int
offset_m
=
blockIdx
.
y
*
m_count
;
int
offset_k
=
blockIdx
.
z
*
BLOCK_KN_SIZE
;
int
end_n
=
min
(
offset_n
+
BLOCK_KN_SIZE
*
4
,
size_n
);
int
end_m
=
min
(
offset_m
+
m_count
,
size_m
);
[[
maybe_unused
]]
int
end_n
=
min
(
offset_n
+
BLOCK_KN_SIZE
*
4
,
size_n
);
[[
maybe_unused
]]
int
end_m
=
min
(
offset_m
+
m_count
,
size_m
);
int
end_k
=
min
(
offset_k
+
BLOCK_KN_SIZE
,
size_k
);
int
n
=
offset_n
+
t
*
4
;
...
...
csrc/rocm/attention.cu
View file @
8c0d15d5
...
...
@@ -308,8 +308,8 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
constexpr
int
GQA_RATIO4
=
DIVIDE_ROUND_UP
(
GQA_RATIO
,
4
);
__shared__
float
shared_qk_max
[
NWARPS
][
16
+
1
];
__shared__
float
shared_exp_sum
[
NWARPS
][
16
+
1
];
[[
maybe_unused
]]
__shared__
float
shared_qk_max
[
NWARPS
][
16
+
1
];
[[
maybe_unused
]]
__shared__
float
shared_exp_sum
[
NWARPS
][
16
+
1
];
// shared_logits is used for multiple purposes
__shared__
_B16x4
shared_logits
[
NWARPS
][
4
][
16
][
4
];
...
...
@@ -426,7 +426,8 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
const
cache_t
*
k_ptr2
=
k_ptr
+
kblock_number
*
kv_block_stride
;
const
int
klocal_token_idx
=
TOKENS_PER_WARP
*
warpid
+
token_depth
*
16
+
lane16id
;
const
int
kglobal_token_idx
=
partition_start_token_idx
+
klocal_token_idx
;
[[
maybe_unused
]]
const
int
kglobal_token_idx
=
partition_start_token_idx
+
klocal_token_idx
;
const
int
kphysical_block_offset
=
klocal_token_idx
%
BLOCK_SIZE
;
const
cache_t
*
k_ptr3
=
k_ptr2
+
kphysical_block_offset
*
KX
;
...
...
Write
Preview
Markdown is supported
0%
Try again or attach a new file.
Attach a file
Cancel
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please register or sign in to comment